Commit 945ff7ca authored by mjbonifa

fixed default output and error files

parent 24c5944e
@@ -125,51 +125,53 @@ Phenotypes are defined in a JSON configuration file. The file describes how sour
An example concept set and code list for Abdominal Pain is shown below:
```json
{
"concept_sets": {
"version": "3.2.10",
"omop": {
"vocabulary_id": "MELDB",
"vocabulary_name": "Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity",
"vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb"
},
"concept_set": [
{
"concept_set_name": "ABDO_PAIN",
"concept_set_status": "AGREED",
"metadata": {
"#": "18",
"CONCEPT DESCRIPTION": "Abdominal pain",
"CONCEPT TYPE": "Workload indicator (symptom)",
"DATE ADDED ": "2023-08-25",
"REQUEST REASON ": "Clinician SF - requested by email - symptom example from Qualitative Evidence Synthesis",
"SOURCE INFO": "YES",
"FUNCTION": "QUERY BY CODING LIST",
"FUNCTION.1": "https://clinicalcodes.rss.mhs.man.ac.uk/",
"CODING LIST": "https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/codes/ClinicalCodes.org%20from%20the%20University%20of%20Manchester/Symptom%20code%20lists/Abdominal%20pain/res176-abdominal-pain.csv ",
"NOTES": "2023-09-08: Clinical SF confirmed that the clinical view would be that this would need to be recurrent or persistent.",
}
},
}
"codes": [
{
"folder": "codes/ClinicalCodes.org from the University of Manchester",
"description": "SF's clinical codes - downloaded 16/11/23",
"files": [
{
"file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv",
"columns": {
"read2_code": "code",
"metadata": [
"description"
]
},
"concept_set": [
"ABDO_PAIN"
]
},
}
"concept_sets": {
"version": "3.2.10",
"omop": {
"vocabulary_id": "MELDB",
"vocabulary_name": "Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity",
"vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb"
},
"concept_set": [
{
"concept_set_name": "ABDO_PAIN",
"concept_set_status": "AGREED",
"metadata": {
"#": "18",
"CONCEPT DESCRIPTION": "Abdominal pain",
"CONCEPT TYPE": "Workload indicator (symptom)",
"DATE ADDED ": "2023-08-25",
"REQUEST REASON ": "Clinician SF - requested by email - symptom example from Qualitative Evidence Synthesis",
"SOURCE INFO": "YES",
"FUNCTION": "QUERY BY CODING LIST",
"FUNCTION.1": "https://clinicalcodes.rss.mhs.man.ac.uk/",
"CODING LIST": "https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/codes/ClinicalCodes.org%20from%20the%20University%20of%20Manchester/Symptom%20code%20lists/Abdominal%20pain/res176-abdominal-pain.csv ",
"NOTES": "2023-09-08: Clinical SF confirmed that the clinical view would be that this would need to be recurrent or persistent."
}
}
]
},
"codes": [
{
"folder": "clinical-codes-org",
"description": "SF's clinical codes - downloaded 16/11/23",
"files": [
{
"file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv",
"columns": {
"read2_code": "code",
"metadata": [
"description"
]
},
"concept_set": [
"ABDO_PAIN"
]
}
]
}
]
}
```
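For orientation, here is a minimal sketch of how this configuration might be consumed in Python. The `phenotype.json` filename is an assumption; the key layout follows the example above.

```python
import json
from pathlib import Path

# Hypothetical filename; the structure mirrors the example above.
config = json.loads(Path("phenotype.json").read_text())

# Enumerate the declared concept sets.
for cs in config["concept_sets"]["concept_set"]:
    print(cs["concept_set_name"], "-", cs["concept_set_status"])

# Walk each code-list folder and show which concept set(s) each file feeds.
for folder in config["codes"]:
    for f in folder["files"]:
        print(Path(folder["folder"]) / f["file"], "->", f.get("concept_set"))
```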
@@ -4,6 +4,8 @@ import trud
import omop
import map
from pathlib import Path
def trud_install(args):
"""Handle the `trud install` command."""
print(f"Installing TRUD")
@@ -31,7 +33,7 @@ def omop_delete(args):
def map_process(args):
"""Handle the `map process` command."""
print(f"Processing map with phenotype config file: {args.config_file}")
print(f"Output directory: {args.output_dir}")
print(f"Output directory: {args.output_file}")
print(f"Target coding format: {args.target_coding}")
if args.translate:
print("Translating code types.")
@@ -51,8 +53,8 @@ def map_process(args):
args.target_coding,
args.translate,
args.verify,
args.error_log,
output_path="MELD_concepts_read.csv")
error_path=Path(args.error_log),
output_path=Path(args.output_file))
print(f"Phenotype processing completed")
@@ -95,15 +97,15 @@ def main():
map_process_parser = map_subparsers.add_parser("process", help="Process map configuration file")
map_process_parser.add_argument("-c", "--config-file", required=True, help="Phenotype configuration file")
map_process_parser.add_argument("-s", "--source-codes-dir", required=True, help="Source codes root directory")
map_process_parser.add_argument("-o", "--output-dir", required=True, help="Output directory for CSV or OMOP database")
map_process_parser.add_argument("-t", "--target-coding", required=True, choices=['read2', 'read3', 'icd10', 'snomed', 'opcs4'], help="Specify the target coding (read2, read3, icd10, snomed, opcs4)")
map_process_parser.add_argument("-o", "--output-file", type=str, default=str(map.OUTPUT_PATH.resolve()), help="Output directory for CSV or OMOP database")
# Flags
map_process_parser.add_argument("-tr", "--translate", action="store_true", default=False, help="Do not translate code types")
map_process_parser.add_argument("-v", "--verify", action="store_true", default=False, help="Do not verify codes")
# Error log file
map_process_parser.add_argument("-l", "--error-log", type=str, default='error.csv', help="Filepath to save error log to")
map_process_parser.add_argument("-l", "--error-log", type=str, default=str(map.ERROR_PATH.resolve()), help="Filepath to save error log to")
# Set the function to call when 'process' subcommand is used
map_process_parser.set_defaults(func=map_process)
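The substance of this commit is visible in the two `default=` changes above: `--output-file` and `--error-log` now fall back to module-level `Path` constants in `map.py` instead of hard-coded filenames. A self-contained sketch of that pattern, using the constants defined later in this diff:

```python
import argparse
from pathlib import Path

# Mirrors map.OUTPUT_PATH and map.ERROR_PATH from map.py below.
OUTPUT_PATH = Path("build") / "phenotype_mapping.csv"
ERROR_PATH = Path("build") / "errors.csv"

parser = argparse.ArgumentParser()
parser.add_argument("-o", "--output-file", type=str,
                    default=str(OUTPUT_PATH.resolve()))
parser.add_argument("-l", "--error-log", type=str,
                    default=str(ERROR_PATH.resolve()))

args = parser.parse_args([])      # no flags given, so the defaults apply
print(args.output_file)           # .../build/phenotype_mapping.csv
print(args.error_log)             # .../build/errors.csv
```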
@@ -24,6 +24,8 @@ from omop import setup
pd.set_option("mode.chained_assignment", None)
OUTPUT_PATH = Path('build') / 'phenotype_mapping.csv'
ERROR_PATH = Path('build') / 'errors.csv'
def read_table_file(path, excel_sheet=None):
"""
@@ -166,8 +168,7 @@ def sql_row_exist(conn, table, column, value):
return exists
def process(config_file, source_codes_dir, target_code_type, translate=True, verify=True, log_errors_path="errors.csv", output_path="MELD_concepts_read.csv"):
def process(config_file, source_codes_dir, target_code_type, translate=True, verify=True, error_path=ERROR_PATH, output_path=OUTPUT_PATH):
config_path = Path(config_file)
if not config_path.is_file():
raise FileNotFoundError(f"Error: phenotype configuration file '{config_path}' does not exist.")
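With the new signature, `error_path` and `output_path` default to `ERROR_PATH` and `OUTPUT_PATH`, so a caller only needs the first three arguments. A hypothetical invocation (file and directory names assumed):

```python
import map

# error_path/output_path fall back to map.ERROR_PATH and map.OUTPUT_PATH.
map.process("phenotype.json", "codes/", "read2",
            translate=False, verify=True)
```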
@@ -196,27 +197,16 @@ def process(config_file, source_codes_dir, target_code_type, translate=True, ver
# Load Code File
if "excel_sheet" in file:
df = read_table_file(
path=file_path, excel_sheet=file["excel_sheet"]
)
df = read_table_file(path=file_path, excel_sheet=file["excel_sheet"])
else:
df = read_table_file(path=file_path)
# Perform Structural Changes to file before preprocessing
# split column with multiple code types
if (
"actions" in file
and "split_col" in file["actions"]
and "codes_col" in file["actions"]
):
if ("actions" in file and "split_col" in file["actions"] and "codes_col" in file["actions"]):
split_col = file["actions"]["split_col"]
codes_col = file["actions"]["codes_col"]
print(
"Action: Splitting",
split_col,
"column into:",
df[split_col].unique(),
)
print("Action: Splitting", split_col, "column into:", df[split_col].unique(),)
codes = df[codes_col]
oh = pd.get_dummies(df[split_col], dtype=bool) # one hot encode
oh = oh.where((oh != True), codes, axis=0) # fill in 1s with codes
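The `split_col` action above one-hot encodes the code-type column and then writes each row's code into its `True` cell, producing one column per code type. A toy illustration (column names hypothetical):

```python
import pandas as pd

df = pd.DataFrame({"code": ["A01", "12345"],
                   "type": ["read2", "snomed"]})

oh = pd.get_dummies(df["type"], dtype=bool)     # one bool column per type
oh = oh.where(oh != True, df["code"], axis=0)   # True cells -> the code
print(oh)
#    read2  snomed
# 0    A01   False
# 1  False   12345
```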
@@ -231,74 +221,36 @@ def process(config_file, source_codes_dir, target_code_type, translate=True, ver
# TODO: enable metacolumns to be outputted - problem with map_file appending
if "metadata" in file["columns"]:
meta_columns += file["columns"]["metadata"]
df = preprocess(
df,
file["columns"],
meta_columns=meta_columns,
file_path=file_path,
target_code_type=target_code_type,
verify=verify,
translate=translate,
)
df = preprocess(df, file["columns"], meta_columns=meta_columns, file_path=file_path, target_code_type=target_code_type, verify=verify, translate=translate)
else:
raise Exception("No column format provided")
# partition table by categorical column
if (
"actions" in file
and "divide_col" in file["actions"]
and len(df) > 0
):
if ("actions" in file and "divide_col" in file["actions"] and len(df) > 0):
divide_col = file["actions"]["divide_col"]
print(
"Action: Dividing Table by",
divide_col,
"column into: ",
df[divide_col].unique(),
)
print("Action: Dividing Table by", divide_col, "column into: ", df[divide_col].unique(),)
df = df.groupby(divide_col)
# Map to MELDB Concept/Phenotype
if len(df) == 0:
pass
# out = df
elif ("concept_set" in file) and isinstance(
df, pd.core.frame.DataFrame
):
out = map_file(
df,
target_code_type,
out,
concepts=file["concept_set"],
meta_columns=meta_columns,
translate=translate,
)
elif ("concept_set_categories" in file) and isinstance(
df, pd.core.groupby.generic.DataFrameGroupBy
):
elif ("concept_set" in file) and isinstance(df, pd.core.frame.DataFrame):
out = map_file(df, target_code_type, out, concepts=file["concept_set"], meta_columns=meta_columns, translate=translate,)
elif ("concept_set_categories" in file) and isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
meta_columns.remove(divide_col) # delete categorical column
for cat, grp in df:
if (
cat in file["concept_set_categories"].keys()
): # check if category is mapped
grp = grp.drop(
columns=[divide_col]
) # delete categorical column
if (cat in file["concept_set_categories"].keys()): # check if category is mapped
grp = grp.drop(columns=[divide_col]) # delete categorical column
print("Category:", cat)
out = map_file(
grp,
target_code_type,
out,
concepts=file["concept_set_categories"][cat],
meta_columns=meta_columns,
)
out = map_file(grp, target_code_type, out, concepts=file["concept_set_categories"][cat], meta_columns=meta_columns,)
else:
print("Folder is empty")
# check if out is empty
if len(out) <= 0:
raise Exception("Output file is empty")
raise Exception("Output dataframe is empty")
# Final Processing
out = out.reset_index(drop=True)
@@ -340,11 +292,11 @@ def process(config_file, source_codes_dir, target_code_type, translate=True, ver
else:
# export as CSV to /output
out.to_csv(output_path, index=False)
print("saved to", output_path)
print("Saved to", output_path)
# Save Error File
if os.path.exists(log_errors_path):
error_df = pd.read_csv(log_errors_path)
if error_path.exists():
error_df = pd.read_csv(error_path)
error_df = error_df.drop_duplicates() # Remove Duplicates from Error file
error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"])
error_df.to_csv(log_errors_path, index=False)
error_df.to_csv(error_path, index=False)
@@ -68,13 +68,12 @@ class Proto_code():
class Read2_code(Proto_code):
def __init__(self, file_path=None):
super().__init__(file_path)
input_path = trud.MAPS_PROCESSED_DIR / 'read2_code.parquet'
if not input_path.is_file():
raise FileNotFoundError(f"Error: Read2 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
self.db = pd.read_parquet(input_path)
self.arg_small = "-r2"
self.arg_long = "--read2-code"
self.arg_help = "Read V2 Codes Column name in Source File"
self.checks = [
(
"Not Empty",
@@ -115,9 +114,6 @@ class Read2_code(Proto_code):
class Read3_code(Proto_code):
def __init__(self, file_path=None):
super().__init__(file_path)
self.arg_small = "-r3"
self.arg_long = "--read3-code"
self.arg_help = "Read V3 Codes Column name in Source File"
input_path = trud.MAPS_PROCESSED_DIR / 'read3_code.parquet'
if not input_path.is_file():
@@ -163,9 +159,6 @@ class Read3_code(Proto_code):
class Icd10_code(Proto_code):
def __init__(self, file_path=None):
super().__init__(file_path)
self.arg_small = "-i"
self.arg_long = "--icd10-code"
self.arg_help = "ICD10 Codes Column name in Source File"
input_path = trud.MAPS_PROCESSED_DIR / 'icd10_code.parquet'
if not input_path.is_file():
@@ -229,9 +222,6 @@ class Icd10_code(Proto_code):
class Snomed_code(Proto_code):
def __init__(self, file_path=None):
super().__init__(file_path)
self.arg_small = "-s"
self.arg_long = "--snomed-code"
self.arg_help = "SNOMED Codes Column name in Source File"
input_path = trud.MAPS_PROCESSED_DIR / 'snomed_code.parquet'
if not input_path.is_file():
@@ -289,9 +279,6 @@ class Snomed_code(Proto_code):
class Opcs4_code(Proto_code):
def __init__(self, file_path=None):
super().__init__(file_path)
self.arg_small = "-o"
self.arg_long = "--opcs4-code"
self.arg_help = "OPCS4 Codes Column name in Source File"
input_path = trud.MAPS_PROCESSED_DIR / 'opcs4_code.parquet'
if not input_path.is_file():
@@ -317,9 +304,6 @@ class Opcs4_code(Proto_code):
class Atc_code(Proto_code):
def __init__(self, file_path=None):
super().__init__(file_path)
self.arg_small = "-a"
self.arg_long = "--atc-code"
self.arg_help = "ATC Codes Column name in Source File"
self.checks = [
(
"Not Empty",
@@ -340,9 +324,6 @@ class Atc_code(Proto_code):
class Med_code(Proto_code):
def __init__(self, file_path=None):
super().__init__(file_path)
self.arg_small = "-m"
self.arg_long = "--med-code"
self.arg_help = "Med Codes Column name in Source File"
self.checks = [
(
"Not Empty",
@@ -354,9 +335,6 @@ class Med_code(Proto_code):
class Cprd_code(Proto_code):
def __init__(self, file_path=None):
super().__init__(file_path)
self.arg_small = "-c"
self.arg_long = "--cprd-code"
self.arg_help = "CPRD Product Codes Column name in Source File"
self.checks = [
(
"Not Empty",
@@ -366,14 +344,14 @@ class Cprd_code(Proto_code):
]
code_types = {
"read2_code": Read2_code,
"read3_code": Read3_code,
"icd10_code": Icd10_code,
"snomed_code": Snomed_code,
"opcs4_code": Opcs4_code,
"atc_code": Atc_code,
"med_code": Med_code,
"cprd_code": Cprd_code,
"read2": Read2_code,
"read3": Read3_code,
"icd10": Icd10_code,
"snomed": Snomed_code,
"opcs4": Opcs4_code,
"atc": Atc_code,
"med": Med_code,
"cprd": Cprd_code,
}
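Renaming the keys aligns them with the CLI's `--target-coding` choices (`read2`, `read3`, `icd10`, `snomed`, `opcs4`), so the parsed argument can index `code_types` directly. A minimal sketch; the `parse` module name is an assumption, as the file defining `code_types` is not named in this diff:

```python
# Hypothetical import; the module defining code_types is not named here.
from parse import code_types

target_coding = "read2"            # value of the CLI's --target-coding
Coder = code_types[target_coding]  # -> Read2_code
coder = Coder()                    # loads read2_code.parquet; raises
                                   # FileNotFoundError if TRUD is absent
```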
vocab_types = {