diff --git a/acmc/phen.py b/acmc/phen.py
index 4b22e84ed1e553bffdd6f3426a44160538080b8f..43ad09eee7db27f748bfec0b3128763b73a45a75 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -42,6 +42,7 @@ CODES_COL_ACTION = "codes_col"
 DIVIDE_COL_ACTION = "divide_col"
 COL_ACTIONS = [SPLIT_COL_ACTION, CODES_COL_ACTION, DIVIDE_COL_ACTION]
 
+CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"]
 
 class PhenValidationException(Exception):
     """Custom exception class raised when validation errors in phenotype configuration file"""
@@ -190,16 +191,15 @@ def init(phen_dir, remote_url):
 
     # create empty phen config file
     config = {
-        "concept_sets": {
+        "phenotype": {
             "version": initial_version,
             "omop": {
                 "vocabulary_id": "",
                 "vocabulary_name": "",
                 "vocabulary_reference": "",
             },
-            "concept_set": [],
-        },
-        "codes": [],
+            "concept_sets": [],
+        }
     }
 
     with open(phen_path / CONFIG_FILE, "w") as file:
@@ -257,7 +257,7 @@ def validate(phen_dir):
     # Load configuration File
     if config_path.suffix == ".yaml":
         with config_path.open("r") as file:
-            mapping = yaml.safe_load(file)
+            phenotype = yaml.safe_load(file)
     else:
         raise Exception(
             f"Unsupported configuration filetype: {str(config_path.resolve())}"
@@ -265,104 +265,70 @@ def validate(phen_dir):
 
     # initiatise
     validation_errors = []
-    concept_sets = mapping["concept_sets"]
-    concept_codes = mapping["codes"]
+    phenotype = phenotype["phenotype"]
     code_types = parse.CodeTypeParser().code_types
 
     # check the version number is of the format vn.n.n
-    match = re.match(r"v(\d+\.\d+\.\d+)", concept_sets["version"])
+    match = re.match(r"v(\d+\.\d+\.\d+)", phenotype["version"])
     if not match:
         validation_errors.append(
-            f"Invalid version format in configuration file: {concept_sets['version']}"
+            f"Invalid version format in configuration file: {phenotype['version']}"
         )
 
     # create a list of all the concept set names defined in the concept set configuration
     concept_set_names = []
-    for item in concept_sets["concept_set"]:
-        if item["concept_set_name"] in concept_set_names:
+    for item in phenotype["concept_sets"]:
+        if item["name"] in concept_set_names:
             validation_errors.append(
-                f"Duplicate concept set defined in concept sets {item['concept_set_name'] }"
+                f"Duplicate concept set defined in concept sets {item['name']}"
             )
         else:
-            concept_set_names.append(item["concept_set_name"])
+            concept_set_names.append(item["name"])
 
+    # TODO: change this to some sort of yaml schema validation
+    required_keys = {"name", "file", "metadata"}
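+    # For illustration, a sketch of what that schema check could look like with
+    # the jsonschema package (hypothetical: jsonschema is not currently a
+    # dependency of acmc):
+    #
+    #   import jsonschema
+    #
+    #   schema = {
+    #       "type": "object",
+    #       "required": ["name", "file", "metadata"],
+    #       "properties": {"file": {"required": ["path", "columns"]}},
+    #   }
+    #   jsonschema.validate(instance=item, schema=schema)  # for each concept set item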
+
     # check codes definition
-    concept_set_mapping_names = []
-    for item in concept_codes:
+    for item in phenotype["concept_sets"]:
 
-        required_keys = {"folder", "files"}
         if required_keys.issubset(item.keys()):
-            # check concept codes path is a directory
-            concept_code_dir_path = codes_path / item["folder"]
-            if not concept_code_dir_path.is_dir():
+
+            # check concept code file exists
+            concept_code_file_path = codes_path / item["file"]["path"]
+            if not concept_code_file_path.exists():
                 validation_errors.append(
-                    f"Folder directory {str(concept_code_dir_path.resolve())} is not a directory"
+                    f"Coding file {str(concept_code_file_path.resolve())} does not exist"
                 )
 
-            for file in item["files"]:
-                # check concepte code file exists
-                concept_code_file_path = concept_code_dir_path / file["file"]
-                if not concept_code_file_path.exists():
-                    validation_errors.append(
-                        f"Coding file {str(concept_code_file_path.resolve())} does not exist"
-                    )
-
-                # check concepte code file is not empty
-                concept_code_file_path = concept_code_dir_path / file["file"]
-                if concept_code_file_path.stat().st_size == 0:
-                    validation_errors.append(
-                        f"Coding file {str(concept_code_file_path.resolve())} is an empty file"
-                    )
+            # check concept code file is not empty
+            if concept_code_file_path.stat().st_size == 0:
+                validation_errors.append(
+                    f"Coding file {str(concept_code_file_path.resolve())} is an empty file"
+                )
 
-                # check columns section exists
-                if "columns" not in file:
+            # check code file type is supported
+            if concept_code_file_path.suffix not in CODE_FILE_TYPES:
+                raise ValueError(
+                    f"Unsupported filetype {concept_code_file_path.suffix}, only {CODE_FILE_TYPES} code file types are supported"
+                )
+
+            # check columns specified are a supported medical coding type
+            for column in item["file"]["columns"]:
+                if column not in code_types:
                     validation_errors.append(
-                        f"Columns not defined for {concept_code_file_path}"
+                        f"Column type {column} for file {concept_code_file_path} is not supported"
                     )
 
-                # check columns specified are a supported medical coding type
-                for column in file["columns"]:
-                    if column not in code_types and column != "metadata":
+            # check the actions are supported
+            if "actions" in item["file"]:
+                for action in item["file"]["actions"]:
+                    if action not in COL_ACTIONS:
                         validation_errors.append(
-                            f"Column type {column} for file {concept_code_file_path} is not supported"
+                            f"Action {action} is not supported"
                         )
 
-                # check the actions are supported
-                if "actions" in file:
-                    for action in file["actions"]:
-                        if action not in COL_ACTIONS:
-                            validation_errors.append(
-                                f"Action {action} is not supported"
-                            )
-
-                # check concept_set defined for the mapping
-                logger.debug(f"file {file}")
-                for concept_set_mapping in file["concept_set"]:
-                    # store the concept set names found for later set operations
-                    logger.debug(f"mapping {concept_set_mapping}")
-                    if concept_set_mapping['name'] not in concept_set_mapping_names:
-                        concept_set_mapping_names.append(concept_set_mapping['name'])
         else:
             validation_errors.append(
-                f"Missing required elements {required_keys} in codes {item}"
+                f"Missing required elements {required_keys} in concept set {item}"
            )
 
-    # create sets to perform set operations on the lists of concept set names
-    concept_set_names_set = set(concept_set_names)
-    concept_set_mapping_names_set = set(concept_set_mapping_names)
-
-    # check all concept sets in the summary section have at least one code mapping
-    concept_set_no_codes = list(concept_set_names_set - concept_set_mapping_names_set)
-    if len(concept_set_no_codes) > 0:
-        validation_errors.append(
-            f"Concept sets do not exist in codes {concept_set_no_codes}"
-        )
-
-    # check all concept sets included in the code mapping are defined in the summary concept_set section
-    codes_no_concept_set = list(concept_set_mapping_names_set - concept_set_names_set)
-    if len(codes_no_concept_set) > 0:
-        validation_errors.append(
-            f"Concept sets mapped in codes do not exist in the concept sets: {codes_no_concept_set}"
-        )
 
     if len(validation_errors) > 0:
         logger.error(validation_errors)
@@ -378,9 +344,11 @@ def read_table_file(path, excel_sheet=None):
     """
     Load Code List File
     """
+
+    path = path.resolve()
     if path.suffix == ".csv":
         df = pd.read_csv(path, dtype=str)
-    elif path.suffix == ".xlsx":
+    elif path.suffix == ".xlsx" or path.suffix == ".xls":
         if excel_sheet:
             df = pd.read_excel(path, sheet_name=excel_sheet, dtype=str)
         else:
@@ -388,21 +356,21 @@ def read_table_file(path, excel_sheet=None):
     elif path.suffix == ".dta":
         df = pd.read_stata(path, dtype=str)
     else:
-        raise Exception(f"Unsupported filetype provided for source file {path.suffix}")
+        raise ValueError(
+            f"Unsupported filetype {path.suffix}, only {CODE_FILE_TYPES} code file types are supported"
+        )
 
     return df
 
 
-def process_actions(df, file):
+def process_actions(df, concept_set):
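+    """Apply the structural actions defined in concept_set["file"]["actions"]
+    (e.g. splitting a combined column into separate code columns) and return
+    the restructured dataframe."""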
     # Perform Structural Changes to file before preprocessing
     logger.debug("Processing file structural actions")
     if (
-        "actions" in file
-        and "split_col" in file["actions"]
-        and "codes_col" in file["actions"]
+        "actions" in concept_set["file"]
+        and "split_col" in concept_set["file"]["actions"]
+        and "codes_col" in concept_set["file"]["actions"]
     ):
-        split_col = file["actions"]["split_col"]
-        codes_col = file["actions"]["codes_col"]
+        split_col = concept_set["file"]["actions"]["split_col"]
+        codes_col = concept_set["file"]["actions"]["codes_col"]
         logger.debug(
             "Action: Splitting",
             split_col,
@@ -419,33 +387,33 @@ def process_actions(df, file):
 
 
 # Perform QA Checks on columns individually and append to df
-def preprocess_codes(df, file, target_code_type=None, codes_file=None):
+def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
     """Parses each column individually - Order and length will not be preserved!"""
     out = pd.DataFrame([])  # create output df to append to
     code_errors = []  # list of errors from processing
 
+    metadata_df = pd.DataFrame()
     meta_columns = []  # meta columns to keep with codes
-    if "actions" in file and "divide_col" in file["actions"]:
-        meta_columns += [file["actions"]["divide_col"]]
+    if (
+        "actions" in concept_set["file"]
+        and "divide_col" in concept_set["file"]["actions"]
+    ):
+        meta_columns += [concept_set["file"]["actions"]["divide_col"]]
+        metadata_df = df[meta_columns]
 
     # TODO: enable metacolumns to be outputted - problem with map_file appending
-    if "metadata" in file["columns"]:
-        meta_columns += file["columns"]["metadata"]
-
-    metadata_df = df[meta_columns]
+    # if "metadata" in file["columns"]:
+    #     meta_columns += file["columns"]["metadata"]
 
     # Preprocess codes
     code_types = parse.CodeTypeParser().code_types
     for code_type_name, code_type_parser in code_types.items():
-        if code_type_name in file["columns"]:
+        if code_type_name in concept_set["file"]["columns"]:
             logger.info(f"Processing {code_type_name} codes...")
 
             # get code types
-            codes = df[file["columns"][code_type_name]].dropna()
+            codes = df[concept_set["file"]["columns"][code_type_name]].dropna()
             codes = codes.astype(str)  # convert to string
             codes = codes.str.strip()  # remove excess spaces
 
             # process codes, validating them using parser and returning the errors
-            codes, errors = code_type_parser.process(codes, codes_file)
+            codes, errors = code_type_parser.process(codes, code_file_path)
             if len(errors) > 0:
                 code_errors.extend(errors)
                 logger.warning(f"Codes validation failed with {len(errors)} errors")
@@ -491,10 +459,7 @@ def translate_codes(df, target_code_type):
 
 
 # Append file's codes to output Df with concept
-def map_file(df, target_code_type, out, concept_names, meta_columns=[]):
-    # seperate out meta_columns
-    metadata_df = df[meta_columns]
-    df = df.drop(columns=meta_columns)
+def map_file(df, target_code_type, out, concept_name):
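+    """Translate a code file's codes to the target code type and append them
+    to `out`, labelled with `concept_name` in the CONCEPT_SET column."""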
 
     # translate codes
     codes = translate_codes(df, target_code_type)
@@ -503,10 +468,8 @@ def map_file(df, target_code_type, out, concept_name):
     # Append to output if translated
     if len(codes) > 0:
         codes = pd.DataFrame({"CONCEPT": codes})
-        codes = codes.join(metadata_df)
-        for concept in concept_names:
-            codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
-            out = pd.concat([out, codes])
+        codes["CONCEPT_SET"] = np.repeat(concept_name.strip(), len(codes))
+        out = pd.concat([out, codes])
     else:
         logger.debug(f"No codes converted with target code type {target_code_type}")
@@ -588,86 +551,73 @@ def map(phen_dir, target_code_type):
     # load configuration
     with config_path.open("r") as file:
         config = yaml.safe_load(file)
-
-    concept_sets = config["concept_sets"]
-    codes = config["codes"]
+    phenotype = config["phenotype"]
 
     # Create output dataframe
     out = pd.DataFrame([])
     code_errors = []
 
-    # Process each folder in codes section
-    for folder in codes:
-        for file in folder["files"]:
-            logger.debug(f"--- {file['file']} ---")
-            codes_file_path = codes_path / folder["folder"] / file["file"]
-
-            # Load Code File
-            if "excel_sheet" in file:
-                df = read_table_file(
-                    path=codes_file_path, excel_sheet=file["excel_sheet"]
-                )
-            else:
-                df = read_table_file(path=codes_file_path)
-
-            # process structural actions
-            df = process_actions(df, file)
+    # Process each concept set defined in the phenotype
+    for concept_set in phenotype["concept_sets"]:
+        logger.debug(f"--- {concept_set['file']} ---")
+
+        # Load Code File
+        codes_file_path = Path(codes_path / concept_set["file"]["path"])
+        df = read_table_file(codes_file_path)
+
+        # process structural actions
+        df = process_actions(df, concept_set)
+
+        # Preprocessing & Validation Checks
+        logger.debug("Processing and validating code formats")
+        df, meta_columns, errors = preprocess_codes(
+            df,
+            concept_set,
+            codes_file_path,
+            target_code_type=target_code_type,
+        )
 
-            # Preprocessing & Validation Checks
-            logger.debug("Processing and validating code formats")
-            df, meta_columns, errors = preprocess_codes(
+        logger.debug(f"Length of errors from preprocess {len(errors)}")
+        if len(errors) > 0:
+            code_errors.extend(errors)
+            logger.debug(f"Length of code_errors {len(code_errors)}")
+
+        # partition table by categorical column
+        if (
+            "actions" in concept_set["file"]
+            and "divide_col" in concept_set["file"]["actions"]
+            and len(df) > 0
+        ):
+            divide_col = concept_set["file"]["actions"]["divide_col"]
+            logger.debug(f"Action: Dividing Table by {divide_col}")
+            logger.debug(df.head())
+            logger.debug(f"Dividing into categories: {df[divide_col].unique()}")
+            df = df.groupby(divide_col)
+
+        # Map to Concept/Phenotype
+        # TODO: This code needs refactoring as handling of the concept_set_categories should happen elsewhere
+        logger.debug(f"instance of df before if: {type(df)}")
+        if isinstance(df, pd.core.frame.DataFrame):
+            out = map_file(
                 df,
-                file,
-                codes_file=str(codes_file_path.resolve()),
-                target_code_type=target_code_type,
+                target_code_type,
+                out,
+                concept_name=concept_set['name']
             )
-
-            logger.debug(f" Length of errors from preprocess {len(errors)}")
-            if len(errors) > 0:
-                code_errors.extend(errors)
-                logger.debug(f" Length of code_errors {len(code_errors)}")
-
-            # partition table by categorical column
-            if "actions" in file and "divide_col" in file["actions"] and len(df) > 0:
-                divide_col = file["actions"]["divide_col"]
-                logger.debug(f"Action: Dividing Table by {divide_col} column into: {df[divide_col].unique()}")
-                df = df.groupby(divide_col)
-
-            # Map to Concept/Phenotype
-            # TODO: This code needs refactory as it seems handling of the concept_set_categories should happen at another place
-            logger.debug(f"instance of df before if: {type(df)}")
-            if isinstance(df, pd.core.frame.DataFrame):
-                concept_names = [concept['name'] for concept in file["concept_set"]]
-                out = map_file(
-                    df,
-                    target_code_type,
-                    out,
-                    concept_names=concept_names,
-                    meta_columns=meta_columns,
-                )
-            elif isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
-                meta_columns.remove(divide_col)  # delete categorical column
-                for cat, grp in df:
-                    for concept_set in file['concept_set']:
-                        # what if there's no category, there's going to be an error
-                        if cat == concept_set["category"]:
-                            grp = grp.drop(
-                                columns=[divide_col]
-                            )  # delete categorical column
-                            logger.debug(f"Mapping category: {cat}")
-                            concept_names = [concept_set["name"]]
-                            out = map_file(
-                                grp,
-                                target_code_type,
-                                out,
-                                concept_names=concept_names,
-                                meta_columns=meta_columns,
-                            )
-            else:
-                logger.debug(f"instance of df: {type(df)}")
-                # raise AttributeError(
-                #     f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
-                # )
+        elif isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
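+            # each concept set keeps only the group whose key matches its
+            # configured category; all other divide_col groups are skipped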
+            for cat, grp in df:
+                # TODO: if no category is defined for this concept set, this lookup raises a KeyError
+                if cat == concept_set["file"]["category"]:
+                    grp = grp.drop(
+                        columns=[divide_col]
+                    )  # delete categorical column
+                    out = map_file(
+                        grp,
+                        target_code_type,
+                        out,
+                        concept_name=concept_set['name']
+                    )
+        else:
+            logger.debug(f"instance of df: {type(df)}")
+            # raise AttributeError(
+            #     f"File {concept_set['file']} has either no concept_set or concept_set_categories, or the associated dataframe object is of the wrong type: concept_set must be a DataFrame, concept_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
+            # )
 
     if len(code_errors) > 0:
         logger.error(f"The map processing has {len(code_errors)} errors")
@@ -682,26 +632,12 @@ def map(phen_dir, target_code_type):
         raise Exception(
             f"No output after map processing, check config {str(config_path.resolve())}"
        )
+
     # Final processing
     out = out.reset_index(drop=True)
     out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
     out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
 
-    # Add concept set definition metadata
-    concept_sets_df = pd.DataFrame(
-        concept_sets["concept_set"]
-    )  # transform to dataframe
-    if "metadata" in concept_sets_df.columns:
-        concept_sets_df = concept_sets_df.join(
-            pd.json_normalize(concept_sets_df["metadata"])
-        )  # metadata to columns
-        concept_sets_df = concept_sets_df.drop(columns=["metadata"])
-    concept_sets_df = concept_sets_df.rename(
-        columns={"concept_set_name": "CONCEPT_SET"}
-    )
-    concept_sets_df = concept_sets_df.drop_duplicates()  # remove duplicates
-    out = out.merge(concept_sets_df, how="left", on="CONCEPT_SET")  # merge with output
-
     # Save output to map directory
     output_filename = target_code_type + ".csv"
     map_path = phen_path / MAP_DIR / output_filename
@@ -762,7 +698,7 @@ def publish(phen_dir):
     config_path = phen_path / CONFIG_FILE
     with config_path.open("r") as file:
         config = yaml.safe_load(file)
-    match = re.match(r"v(\d+\.\d+)", config["concept_sets"]["version"])
+    match = re.match(r"v(\d+\.\d+)", config["phenotype"]["version"])
     major_version = match.group(1)
 
     # get latest minor version from git commit count
@@ -772,7 +708,7 @@ def publish(phen_dir):
     next_minor_version = commit_count + 1
     version = f"v{major_version}.{next_minor_version}"
     logger.debug(f"New version: {version}")
-    config["concept_sets"]["version"] = version
+    config["phenotype"]["version"] = version
 
     with open(config_path, "w") as file:
         yaml.dump(config, file, default_flow_style=False, sort_keys=False)
@@ -826,8 +762,8 @@ def export(phen_dir, version):
     export_db_path = omop.export(
         map_path,
         export_path,
-        config["concept_sets"]["version"],
-        config["concept_sets"]["omop"],
+        config["phenotype"]["version"],
+        config["phenotype"]["omop"],
     )
 
     # write to tables
@@ -925,7 +861,7 @@ def diff(phen_dir, phen_old_dir):
     with new_config.open("r") as file:
         new_config = yaml.safe_load(file)
     report.write(
-        f"\n\n# Report for version {new_config['concept_sets']['version']}\n\n"
+        f"\n\n# Report for version {new_config['phenotype']['version']}\n\n"
     )
     report.write(f"- Removed outputs: {list(removed_outputs)}\n")
     report.write(f"- Added outputs: {list(added_outputs)}\n")
diff --git a/examples/config1.yaml b/examples/config1.yaml
index f8c21bc050f5bd28970751d59a456b0815f16550..2f446aae11cb8c840c64422d15dc36a03ea1ec87 100644
--- a/examples/config1.yaml
+++ b/examples/config1.yaml
@@ -1,23 +1,14 @@
-concept_sets:
+phenotype:
   version: "v1.0.1"
   omop:
     vocabulary_id: "ACMC_Example"
     vocabulary_name: "ACMC example phenotype"
     vocabulary_reference: "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples"
-  concept_set:
-    - concept_set_name: "ABDO_PAIN"
-      metadata: {}
-
-codes:
-  - folder: "clinical-codes-org"
-    description: "Downloaded 16/11/23"
-    files:
-      - file: "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
+  concept_sets:
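+    # each concept set names an output set and points at one code file;
+    # `columns` maps a supported code type (e.g. read2) to the column holding the codes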
config["phenotype"]["omop"], ) # write to tables @@ -925,7 +861,7 @@ def diff(phen_dir, phen_old_dir): with new_config.open("r") as file: new_config = yaml.safe_load(file) report.write( - f"\n\n# Report for version {new_config['concept_sets']['version']}\n\n" + f"\n\n# Report for version {new_config['phenotype']['version']}\n\n" ) report.write(f"- Removed outputs: {list(removed_outputs)}\n") report.write(f"- Added outputs: {list(added_outputs)}\n") diff --git a/examples/config1.yaml b/examples/config1.yaml index f8c21bc050f5bd28970751d59a456b0815f16550..2f446aae11cb8c840c64422d15dc36a03ea1ec87 100644 --- a/examples/config1.yaml +++ b/examples/config1.yaml @@ -1,23 +1,14 @@ -concept_sets: +phenotype: version: "v1.0.1" omop: vocabulary_id: "ACMC_Example" vocabulary_name: "ACMC example phenotype" vocabulary_reference: "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples" - concept_set: - - concept_set_name: "ABDO_PAIN" - metadata: {} - -codes: - - folder: "clinical-codes-org" - description: "Downloaded 16/11/23" - files: - - file: "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv" + concept_sets: + - name: "ABDO_PAIN" + file: + path: "clinical-codes-org/Symptom code lists/Abdominal pain/res176-abdominal-pain.csv" columns: read2: "code" - metadata: - - "description" - concept_set: - - name: "ABDO_PAIN" - + metadata: {} diff --git a/examples/config2.yaml b/examples/config2.yaml index ccf2839a3fde7362c213fa0022895a872c60e0ee..24acf969f13b58a777978b6fba258dc6e71471e6 100644 --- a/examples/config2.yaml +++ b/examples/config2.yaml @@ -1,29 +1,19 @@ -concept_sets: +phenotype: version: "v1.0.4" omop: vocabulary_id: "ACMC_Example" vocabulary_name: "ACMC example phenotype" vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example" - concept_set: - - concept_set_name: "CVD_EVENTS" - metadata: {} - - concept_set_name: "DID_NOT_ATTEND" - metadata: {} - -codes: - - folder: "clinical-codes-org" - description: "Downloaded 16/11/23" - files: - - file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv" + concept_sets: + - name: "CVD_EVENTS" + file: + path: "clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv" columns: icd10: "code" - metadata: [] - concept_set: - - name: "CVD_EVENTS" - - file: "Non-attendance codes/res201-did-not-attend-appointment.csv" + metadata: {} + - name: "DID_NOT_ATTEND" + file: + path: "clinical-codes-org/Non-attendance codes/res201-did-not-attend-appointment.csv" columns: read2: "code" - metadata: [] - concept_set: - - name: "DID_NOT_ATTEND" - + metadata: {} \ No newline at end of file diff --git a/examples/config3.yaml b/examples/config3.yaml index 5ff517974fd5ecaee435e9956712a510c48df9b4..411606a1b0390dad52c5e14e1a23ed9490d06c0d 100644 --- a/examples/config3.yaml +++ b/examples/config3.yaml @@ -1,46 +1,38 @@ -concept_sets: +phenotype: version: "v1.0.4" omop: vocabulary_id: "ACMC_Example" vocabulary_name: "ACMC example phenotype" vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example" - concept_set: - - concept_set_name: "CVD_EVENTS" - metadata: {} - - concept_set_name: "DID_NOT_ATTEND" - metadata: {} - - concept_set_name: "HYPERTENSION" - metadata: {} - - concept_set_name: "DEPRESSION" - metadata: {} - -codes: - - folder: "clinical-codes-org" - description: "Downloaded 16/11/23" - files: - - file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv" + concept_sets: + - name: "CVD_EVENTS" + 
+    - name: "HYPERTENSION"
+      file:
+        path: "hanlon/Read_codes_for_diagnoses.csv"
         columns:
-          read2: Read Code
+          read2: "Read Code"
+        category: "2"
         actions:
-          divide_col: MMCode
-        concept_set:
-          - name: HYPERTENSION
-            category: "2"
-          - name: DEPRESSION
-            category: "3"
+          divide_col: "MMCode"
+      metadata: {}
+    - name: "DEPRESSION"
+      file:
+        path: "hanlon/Read_codes_for_diagnoses.csv"
+        columns:
+          read2: "Read Code"
+        category: "3"
+        actions:
+          divide_col: "MMCode"
+      metadata: {}