diff --git a/acmc/phen.py b/acmc/phen.py index 532ecab6f62ac259e2c4624d2bbfc217e4907507..25ab3d448e88cb4123241ef3df2ead2dad9845dc 100644 --- a/acmc/phen.py +++ b/acmc/phen.py @@ -350,24 +350,8 @@ def log_invalid_code(codes, mask, code_type=None, file_path=None, cause=None): return codes[mask] -def preprocess_code(out, codes, codes_file, checker, output_col, metadata_df): - - # preprocess codes - codes = codes.astype(str) # convert to string - codes = codes.str.strip() # remove excess spaces - - codes, errors = checker.process(codes, codes_file) - if len(errors) > 0: - raise Exception(f"Code validation failed with {len(errors)} errors") - - - # add metadata columns - out = pd.concat([out, pd.DataFrame({output_col: codes}).join(metadata_df)], ignore_index=True) - - return out - # Perform QA Checks on columns individually and append to df -def preprocess(df, file, target_code_type=None, codes_file=None): +def preprocess_codes(df, file, target_code_type=None, codes_file=None): """ Parses each column individually - Order and length will not be preserved! """ out = pd.DataFrame([]) # create output df to append to @@ -377,31 +361,26 @@ def preprocess(df, file, target_code_type=None, codes_file=None): # TODO: enable metacolumns to be outputted - problem with map_file appending if "metadata" in file["columns"]: meta_columns += file["columns"]["metadata"] + + metadata_df = df[meta_columns] -# if target_code_type and not translate: -# # QA only on target codes -# if target_code_type in file['columns']: -# logger.info(f"Processing {target_code_type} Codes...") -# out = preprocess_code(out=out, -# codes=df[file['columns'][target_code_type]].dropna(), -# codes_file=codes_file, -# checker=parse.code_types[target_code_type], -# output_col=target_code_type, -# metadata_df=df[meta_columns]) -# else: -# logger.warning(f"No {target_code_type} Codes to process") -# else: - - # QA for every code type in df run preprocess_code() - for code_type_name, code_type in parse.code_types.items(): + # Preprocess codes + for code_type_name, code_type_parser in parse.code_types.items(): if code_type_name in file['columns']: - logger.info(f"Processing {code_type_name} Codes...") - out = preprocess_code(out=out, - codes=df[file['columns'][code_type_name]].dropna(), - codes_file=codes_file, - checker=code_type, - output_col=code_type_name, - metadata_df=df[meta_columns]) + logger.info(f"Processing {code_type_name} codes...") + + # get code types + codes = df[file['columns'][code_type_name]].dropna() + codes = codes.astype(str) # convert to string + codes = codes.str.strip() # remove excess spaces + + # process codes, validating them using parser and returning the errors + codes, errors = code_type_parser.process(codes, codes_file, ignore_errors=True) + if len(errors) > 0: + raise Exception(f"Code validation failed with {len(errors)} errors") + + # add metadata columns + out = pd.concat([out, pd.DataFrame({code_type_name: codes}).join(metadata_df)], ignore_index=True) return out, meta_columns @@ -501,10 +480,10 @@ def map(phen_dir, target_code_type): # Preprocessing & Validation Checks logger.debug("Processing and validating code formats") - df, meta_columns = preprocess(df, - file, - codes_file=str(codes_file_path.resolve()), - target_code_type=target_code_type) + df, meta_columns = preprocess_codes(df, + file, + codes_file=str(codes_file_path.resolve()), + target_code_type=target_code_type) # partition table by categorical column if ("actions" in file and "divide_col" in file["actions"] and len(df) > 0): diff --git a/tests/test_acmc.py b/tests/test_acmc.py index 874d4d5cd8cf9a71c74625b254ece4301dc6cc40..c02563f55a430523619cc808e767f3497bc3bf46 100644 --- a/tests/test_acmc.py +++ b/tests/test_acmc.py @@ -67,10 +67,11 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog): assert "Phenotype validated successfully" in caplog.text # map phenotype - with caplog.at_level(logging.DEBUG): - monkeypatch.setattr(sys, "argv", ["main.py", "phen", "map", "-d", str(phen_path.resolve()), "-t", "read3"]) - main.main() - assert "Phenotype processed successfully" in caplog.text + for code_type in ["read2", "read3", "snomed"]: + with caplog.at_level(logging.DEBUG): + monkeypatch.setattr(sys, "argv", ["main.py", "phen", "map", "-d", str(phen_path.resolve()), "-t", code_type]) + main.main() + assert "Phenotype processed successfully" in caplog.text # publish phenotype with caplog.at_level(logging.DEBUG):