diff --git a/acmc/phen.py b/acmc/phen.py
index a068efe02734a27a84dc7069703739dd141c6067..0691a099ed44554c4af77378f5f9c15eb5f7c41f 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -235,34 +235,46 @@ def validate(phen_dir):
     # check codes definition
     concept_set_mapping_names = []
     for item in concept_codes:
-        # check concept codes path is a directory
-        concept_code_dir_path = codes_path / item['folder']
-        if not concept_code_dir_path.is_dir():
-            validation_errors.append(f"Folder directory {str(concept_code_dir_path.resolve())} is not a directory")
-
-        for file in item["files"]:
-            # check concepte code file exists
-            concept_code_file_path = concept_code_dir_path / file['file']
-            if not concept_code_file_path.exists():
-                validation_errors.append(f"Coding file {str(concept_code_file_path.resolve())} does not exist")
-
-            # check columns specified are a supported medical coding type
-            for column in file['columns']:
-                if column not in code_types and column != 'metadata':
-                    validation_errors.append(f"Column type {column} for file {concept_code_file_path} is not supported")
-
-            # check the actions are supported
-            if 'actions' in file:
-                for action in file['actions']:
-                    if action not in COL_ACTIONS:
-                        validation_errors.append(f"Action {action} is not supported")
-
-            # check concept_set defined for the mapping
-            for concept_set_mapping in file['concept_set']:
-                # store the concept set names found for later set operations
-                if concept_set_mapping not in concept_set_mapping_names:
-                    concept_set_mapping_names.append(concept_set_mapping)
-
+        required_keys = {"folder", "files"}
+        if required_keys.issubset(item.keys()):
+            # check concept codes path is a directory
+            concept_code_dir_path = codes_path / item['folder']
+            if not concept_code_dir_path.is_dir():
+                validation_errors.append(f"Folder directory {str(concept_code_dir_path.resolve())} is not a directory")
+
+            for file in item["files"]:
+                # check concept code file exists
+                concept_code_file_path = concept_code_dir_path / file['file']
+                if not concept_code_file_path.exists():
+                    validation_errors.append(f"Coding file {str(concept_code_file_path.resolve())} does not exist")
+
+                # check concept code file is not empty
+                concept_code_file_path = concept_code_dir_path / file['file']
+                if concept_code_file_path.stat().st_size == 0:
+                    validation_errors.append(f"Coding file {str(concept_code_file_path.resolve())} is an empty file")
+
+                # check columns section exists
+                if "columns" not in file:
+                    validation_errors.append(f"Columns not defined for {concept_code_file_path}")
+
+                # check columns specified are a supported medical coding type
+                for column in file['columns']:
+                    if column not in code_types and column != 'metadata':
+                        validation_errors.append(f"Column type {column} for file {concept_code_file_path} is not supported")
+
+                # check the actions are supported
+                if 'actions' in file:
+                    for action in file['actions']:
+                        if action not in COL_ACTIONS:
+                            validation_errors.append(f"Action {action} is not supported")
+
+                # check concept_set defined for the mapping
+                for concept_set_mapping in file['concept_set']:
+                    # store the concept set names found for later set operations
+                    if concept_set_mapping not in concept_set_mapping_names:
+                        concept_set_mapping_names.append(concept_set_mapping)
+        else:
+            validation_errors.append(f"Missing required elements {required_keys} in codes {item}")
     # create sets to perform set operations on the lists of concept set names
     concept_set_names_set = set(concept_set_names)
     concept_set_mapping_names_set = set(concept_set_mapping_names)
@@ -302,22 +314,20 @@ def read_table_file(path, excel_sheet=None):
 
     return df
 
-def preprocess_code(out, codes, codes_file, checker, output_col, df_meta, verify=True):
-    logger.debug(f"  Preprocess_code")
+def preprocess_code(out, codes, codes_file, checker, output_col, df_meta):
     codes = codes.astype(str)  # convert to string
     codes = codes.str.strip()  # remove excess spaces
-    logger.debug(f"  CODE TYPE IN PREPROCESS {type(codes)}")
-    if verify:
-        codes, errors = checker.process(codes, codes_file)  # resolve any identified issues
-        if len(errors) > 0:
-            raise Exception(f"Code validation failed with {len(errors)} errors")
+
+    codes, errors = checker.process(codes, codes_file)  # resolve any identified issues
+    if len(errors) > 0:
+        raise Exception(f"Code validation failed with {len(errors)} errors")
 
     # add metadata columns
     out = pd.concat([out, pd.DataFrame({output_col: codes}).join(df_meta)], ignore_index=True)
     return out
 
 # Perform QA Checks on columns individually and append to df
-def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=None, verify=True, translate=True,):
+def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=None, translate=True,):
     """ Parses each column individually - Order and length will not be preserved! """
     out = pd.DataFrame([])  # create output df to append to
     logger.debug(f"CODES file {codes_file}")
@@ -330,8 +340,7 @@
             codes_file=codes_file,
             checker=code_types[target_code_type](file_path),
             output_col=target_code_type,
-            df_meta=df[meta_columns],
-            verify=verify,)
+            df_meta=df[meta_columns])
         else:
             logger.warning(f"No {target_code_type} Codes to process")
     else:
@@ -344,8 +353,7 @@
             codes_file=codes_file,
             checker=v(),
             output_col=k,
-            df_meta=df[meta_columns],
-            verify=verify,)
+            df_meta=df[meta_columns])
     return out
 
 # Translate Df with multiple codes into single code type Series
@@ -408,11 +416,10 @@ def sql_row_exist(conn, table, column, value):
 
     return exists
 
-def map(phen_dir, target_code_type, translate=True, verify=True):
+def map(phen_dir, target_code_type, translate=True):
     logger.info(f"Processing phenotype directory: {phen_dir}")
     logger.debug(f"Target coding format: {target_code_type}")
     logger.debug(f"Translating: {translate}")
-    logger.debug(f"Verifying: {verify}")
 
     # Validate configuration
     validate(phen_dir)
@@ -470,7 +477,6 @@ def map(phen_dir, target_code_type, translate=True, verify=True):
                 meta_columns=meta_columns,
                 codes_file=str(codes_file_path.resolve()),
                 target_code_type=target_code_type,
-                verify=verify,
                 translate=translate)
     else:
         raise Exception("No column format provided")
diff --git a/tests/test_acmc.py b/tests/test_acmc.py
index 81756246d99d8d6837f23da59d362f803d4c2667..6881cc33fa87e7f5fa637ba81c6f38dfbc968ed1 100644
--- a/tests/test_acmc.py
+++ b/tests/test_acmc.py
@@ -76,7 +76,7 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog):
 
     # map phenotype
     with caplog.at_level(logging.DEBUG):
-        monkeypatch.setattr(sys, "argv", ["main.py", "phen", "map", "-d", str(phen_path.resolve()), "-t", "read2", "-tr", "-ve"])
+        monkeypatch.setattr(sys, "argv", ["main.py", "phen", "map", "-d", str(phen_path.resolve()), "-t", "read3", "-tr"])
         main.main()
     assert "Phenotype processed successfully" in caplog.text