From 945ff7caa9c45bbe69b1478ec02cd25cc370e8b3 Mon Sep 17 00:00:00 2001 From: Michael Boniface <m.j.boniface@soton.ac.uk> Date: Sat, 15 Feb 2025 17:27:34 +0000 Subject: [PATCH] fixed default output and error files --- README.md | 90 ++++++++++++++++++++++++++++--------------------------- acmc.py | 12 ++++---- map.py | 88 +++++++++++++---------------------------------------- parse.py | 42 +++++++------------------- 4 files changed, 83 insertions(+), 149 deletions(-) diff --git a/README.md b/README.md index 35c3796..c2dd4da 100644 --- a/README.md +++ b/README.md @@ -125,51 +125,53 @@ Phenotypes are defined in a JSON configuration file. The file describes how sour An example concept set and code list for Abdominal Pain is show below: ```json - { - "concept_sets": { - "version": "3.2.10", - "omop": { - "vocabulary_id": "MELDB", - "vocabulary_name": "Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity", - "vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb" - }, - "concept_set": [ - { - "concept_set_name": "ABDO_PAIN", - "concept_set_status": "AGREED", - "metadata": { - "#": "18", - "CONCEPT DESCRIPTION": "Abdominal pain", - "CONCEPT TYPE": "Workload indicator (symptom)", - "DATE ADDED ": "2023-08-25", - "REQUEST REASON ": "Clinician SF - requested by email - symptom example from Qualitative Evidence Synthesis", - "SOURCE INFO": "YES", - "FUNCTION": "QUERY BY CODING LIST", - "FUNCTION.1": "https://clinicalcodes.rss.mhs.man.ac.uk/", - "CODING LIST": "https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/codes/ClinicalCodes.org%20from%20the%20University%20of%20Manchester/Symptom%20code%20lists/Abdominal%20pain/res176-abdominal-pain.csv ", - "NOTES": "2023-09-08: Clinical SF confirmed that the clinical view would be that this would need to be recurrent or persistent.", - } - }, - } - "codes": [ - { - "folder": "codes/ClinicalCodes.org from the University of Manchester", - 
"description": "SF's clinical codes - downloaded 16/11/23", - "files": [ - { - "file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv", - "columns": { - "read2_code": "code", - "metadata": [ - "description" - ] - }, - "concept_set": [ - "ABDO_PAIN" - ] - }, - } + "concept_sets": { + "version": "3.2.10", + "omop": { + "vocabulary_id": "MELDB", + "vocabulary_name": "Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity", + "vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb" + }, + "concept_set": [ + { + "concept_set_name": "ABDO_PAIN", + "concept_set_status": "AGREED", + "metadata": { + "#": "18", + "CONCEPT DESCRIPTION": "Abdominal pain", + "CONCEPT TYPE": "Workload indicator (symptom)", + "DATE ADDED ": "2023-08-25", + "REQUEST REASON ": "Clinician SF - requested by email - symptom example from Qualitative Evidence Synthesis", + "SOURCE INFO": "YES", + "FUNCTION": "QUERY BY CODING LIST", + "FUNCTION.1": "https://clinicalcodes.rss.mhs.man.ac.uk/", + "CODING LIST": "https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/codes/ClinicalCodes.org%20from%20the%20University%20of%20Manchester/Symptom%20code%20lists/Abdominal%20pain/res176-abdominal-pain.csv ", + "NOTES": "2023-09-08: Clinical SF confirmed that the clinical view would be that this would need to be recurrent or persistent." 
+ } + } + ] + }, + "codes": [ + { + "folder": "clinical-codes-org", + "description": "SF's clinical codes - downloaded 16/11/23", + "files": [ + { + "file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv", + "columns": { + "read2_code": "code", + "metadata": [ + "description" + ] + }, + "concept_set": [ + "ABDO_PAIN" + ] + } + ] + } + ] } ``` diff --git a/acmc.py b/acmc.py index f7c5817..ae4b051 100644 --- a/acmc.py +++ b/acmc.py @@ -4,6 +4,8 @@ import trud import omop import map +from pathlib import Path + def trud_install(args): """Handle the `trud install` command.""" print(f"Installing TRUD") @@ -31,7 +33,7 @@ def omop_delete(args): def map_process(args): """Handle the `map process` command.""" print(f"Processing map with phenotype config file: {args.config_file}") - print(f"Output directory: {args.output_dir}") + print(f"Output directory: {args.output_file}") print(f"Target coding format: {args.target_coding}") if args.translate: print("Translating code types.") @@ -51,8 +53,8 @@ def map_process(args): args.target_coding, args.translate, args.verify, - args.error_log, - output_path="MELD_concepts_read.csv") + error_path=Path(args.error_log), + output_path=Path(args.output_file)) print(f"Phenotype processing completed") @@ -95,15 +97,15 @@ def main(): map_process_parser = map_subparsers.add_parser("process", help="Process map configuration file") map_process_parser.add_argument("-c", "--config-file", required=True, help="Phenotype configuration file") map_process_parser.add_argument("-s", "--source-codes-dir", required=True, help="Source codes root directory") - map_process_parser.add_argument("-o", "--output-dir", required=True, help="Output directory for CSV or OMOP database") map_process_parser.add_argument("-t", "--target-coding", required=True, choices=['read2', 'read3', 'icd10', 'snomed', 'opcs4'], help="Specify the target coding (read2, read3, icd10, snomed, opcs4)") + map_process_parser.add_argument("-o", "--output-file", type=str, 
default=str(map.OUTPUT_PATH.resolve()), help="Output file for CSV or OMOP database") # Flags map_process_parser.add_argument("-tr", "--translate", action="store_true", default=False, help="Do not translate code types") map_process_parser.add_argument("-v", "--verify", action="store_true", default=False, help="Do not verify codes") # Error log file - map_process_parser.add_argument("-l", "--error-log", type=str, default='error.csv', help="Filepath to save error log to") + map_process_parser.add_argument("-l", "--error-log", type=str, default=str(map.ERROR_PATH.resolve()), help="Filepath to save error log to") # Set the function to call when 'process' subcommand is used map_process_parser.set_defaults(func=map_process) diff --git a/map.py b/map.py index c468dfa..5456de7 100644 --- a/map.py +++ b/map.py @@ -24,6 +24,8 @@ from omop import setup pd.set_option("mode.chained_assignment", None) +OUTPUT_PATH = Path('build') / 'phenotype_mapping.csv' +ERROR_PATH = Path('build') / 'errors.csv' def read_table_file(path, excel_sheet=None): """ @@ -166,8 +168,7 @@ def sql_row_exist(conn, table, column, value): return exists - -def process(config_file, source_codes_dir, target_code_type, translate=True, verify=True, log_errors_path="errors.csv", output_path="MELD_concepts_read.csv"): +def process(config_file, source_codes_dir, target_code_type, translate=True, verify=True, error_path=ERROR_PATH, output_path=OUTPUT_PATH): config_path = Path(config_file) if not config_path.is_file(): raise FileNotFoundError(f"Error: phenotype configuration file '{config_path}' does not exist.") @@ -196,27 +197,16 @@ def process(config_file, source_codes_dir, target_code_type, translate=True, ver # Load Code File if "excel_sheet" in file: - df = read_table_file( - path=file_path, excel_sheet=file["excel_sheet"] - ) + df = read_table_file(path=file_path, excel_sheet=file["excel_sheet"]) else: df = read_table_file(path=file_path) # Perform Structural Changes to file before preprocessing # split
column with multiple code types - if ( - "actions" in file - and "split_col" in file["actions"] - and "codes_col" in file["actions"] - ): + if ("actions" in file and "split_col" in file["actions"] and "codes_col" in file["actions"]): split_col = file["actions"]["split_col"] codes_col = file["actions"]["codes_col"] - print( - "Action: Splitting", - split_col, - "column into:", - df[split_col].unique(), - ) + print("Action: Splitting", split_col, "column into:", df[split_col].unique(),) codes = df[codes_col] oh = pd.get_dummies(df[split_col], dtype=bool) # one hot encode oh = oh.where((oh != True), codes, axis=0) # fill in 1s with codes @@ -231,74 +221,36 @@ def process(config_file, source_codes_dir, target_code_type, translate=True, ver # TODO: enable metacolumns to be outputted - problem with map_file appending if "metadata" in file["columns"]: meta_columns += file["columns"]["metadata"] - df = preprocess( - df, - file["columns"], - meta_columns=meta_columns, - file_path=file_path, - target_code_type=target_code_type, - verify=verify, - translate=translate, - ) + df = preprocess(df, file["columns"], meta_columns=meta_columns, file_path=file_path, target_code_type=target_code_type, verify=verify, translate=translate) else: raise Exception("No column format provided") # partition table by categorical column - if ( - "actions" in file - and "divide_col" in file["actions"] - and len(df) > 0 - ): + if ("actions" in file and "divide_col" in file["actions"] and len(df) > 0): divide_col = file["actions"]["divide_col"] - print( - "Action: Dividing Table by", - divide_col, - "column into: ", - df[divide_col].unique(), - ) + print("Action: Dividing Table by", divide_col, "column into: ", df[divide_col].unique(),) df = df.groupby(divide_col) # Map to MELDB Concept/Phenotype if len(df) == 0: pass # out = df - elif ("concept_set" in file) and isinstance( - df, pd.core.frame.DataFrame - ): - out = map_file( - df, - target_code_type, - out, - concepts=file["concept_set"], - 
meta_columns=meta_columns, - translate=translate, - ) - elif ("concept_set_categories" in file) and isinstance( - df, pd.core.groupby.generic.DataFrameGroupBy - ): + elif ("concept_set" in file) and isinstance(df, pd.core.frame.DataFrame): + out = map_file(df, target_code_type, out, concepts=file["concept_set"], meta_columns=meta_columns, translate=translate,) + elif ("concept_set_categories" in file) and isinstance(df, pd.core.groupby.generic.DataFrameGroupBy): meta_columns.remove(divide_col) # delete categorical column for cat, grp in df: - if ( - cat in file["concept_set_categories"].keys() - ): # check if category is mapped - grp = grp.drop( - columns=[divide_col] - ) # delete categorical column + if (cat in file["concept_set_categories"].keys()): # check if category is mapped + grp = grp.drop(columns=[divide_col]) # delete categorical column print("Category:", cat) - out = map_file( - grp, - target_code_type, - out, - concepts=file["concept_set_categories"][cat], - meta_columns=meta_columns, - ) + out = map_file(grp, target_code_type, out, concepts=file["concept_set_categories"][cat], meta_columns=meta_columns,) else: print("Folder is empty") # check if out is empty if len(out) <= 0: - raise Exception("Output file is empty") + raise Exception("Output dataframe is empty") # Final Processing out = out.reset_index(drop=True) @@ -340,11 +292,11 @@ def process(config_file, source_codes_dir, target_code_type, translate=True, ver else: # export as CSV to /output out.to_csv(output_path, index=False) - print("saved to", output_path) + print("Saved to", output_path) # Save Error File - if os.path.exists(log_errors_path): - error_df = pd.read_csv(log_errors_path) + if error_path.exists(): + error_df = pd.read_csv(error_path) error_df = error_df.drop_duplicates() # Remove Duplicates from Error file error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"]) - error_df.to_csv(log_errors_path, index=False) + error_df.to_csv(error_path, index=False) diff --git 
a/parse.py b/parse.py index 33907e6..ddf0748 100644 --- a/parse.py +++ b/parse.py @@ -68,13 +68,12 @@ class Proto_code(): class Read2_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) + input_path = trud.MAPS_PROCESSED_DIR / 'read2_code.parquet' if not input_path.is_file(): raise FileNotFoundError(f"Error: Read2 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") self.db = pd.read_parquet(input_path) - self.arg_small = "-r2" - self.arg_long = "--read2-code" - self.arg_help = "Read V2 Codes Column name in Source File" + self.checks = [ ( "Not Empty", @@ -115,9 +114,6 @@ class Read2_code(Proto_code): class Read3_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - self.arg_small = "-r3" - self.arg_long = "--read3-code" - self.arg_help = "Read V3 Codes Column name in Source File" input_path = trud.MAPS_PROCESSED_DIR / 'read3_code.parquet' if not input_path.is_file(): @@ -163,9 +159,6 @@ class Read3_code(Proto_code): class Icd10_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - self.arg_small = "-i" - self.arg_long = "--icd10-code" - self.arg_help = "ICD10 Codes Column name in Source File" input_path = trud.MAPS_PROCESSED_DIR / 'icd10_code.parquet' if not input_path.is_file(): @@ -229,9 +222,6 @@ class Icd10_code(Proto_code): class Snomed_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - self.arg_small = "-s" - self.arg_long = "--snomed-code" - self.arg_help = "SNOMED Codes Column name in Source File" input_path = trud.MAPS_PROCESSED_DIR / 'snomed_code.parquet' if not input_path.is_file(): @@ -289,9 +279,6 @@ class Snomed_code(Proto_code): class Opcs4_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - self.arg_small = "-o" - self.arg_long = "--opcs4-code" - self.arg_help = "OPCS4 Codes Column name in Source File" input_path = trud.MAPS_PROCESSED_DIR / 
'opcs4_code.parquet' if not input_path.is_file(): @@ -317,9 +304,6 @@ class Opcs4_code(Proto_code): class Atc_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - self.arg_small = "-a" - self.arg_long = "--atc-code" - self.arg_help = "ATC Codes Column name in Source File" self.checks = [ ( "Not Empty", @@ -340,9 +324,6 @@ class Atc_code(Proto_code): class Med_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - self.arg_small = "-m" - self.arg_long = "--med-code" - self.arg_help = "Med Codes Column name in Source File" self.checks = [ ( "Not Empty", @@ -354,9 +335,6 @@ class Med_code(Proto_code): class Cprd_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - self.arg_small = "-c" - self.arg_long = "--cprd-code" - self.arg_help = "CPRD Product Codes Column name in Source File" self.checks = [ ( "Not Empty", @@ -366,14 +344,14 @@ class Cprd_code(Proto_code): ] code_types = { - "read2_code": Read2_code, - "read3_code": Read3_code, - "icd10_code": Icd10_code, - "snomed_code": Snomed_code, - "opcs4_code": Opcs4_code, - "atc_code": Atc_code, - "med_code": Med_code, - "cprd_code": Cprd_code, + "read2": Read2_code, + "read3": Read3_code, + "icd10": Icd10_code, + "snomed": Snomed_code, + "opcs4": Opcs4_code, + "atc": Atc_code, + "med": Med_code, + "cprd": Cprd_code, } vocab_types = { -- GitLab