diff --git a/acmc/phen.py b/acmc/phen.py
index 0691a099ed44554c4af77378f5f9c15eb5f7c41f..282186c31a1c51802f9831772f43d2e23cc12765 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -15,7 +15,7 @@ from urllib.parse import urlparse, urlunparse
 
 # acmc imports
 from acmc import trud, omop
-from acmc.parse import Read2, Read3, Icd10, Snomed, Opcs4, Atc, code_types, vocab_types
+from acmc.parse import code_types
 from acmc.omop import publish_concept_sets, setup
 
 # setup logging
@@ -33,7 +33,6 @@ CONCEPT_SET_DIR = 'concept-set'
 DEFAULT_PHEN_DIR_LIST = [CODES_DIR, MAP_DIR, CONCEPT_SET_DIR]
 
 CONFIG_FILE = 'config.json'
-ERROR_FILE = 'errors.csv'
 REPORT_FILE = 'report.md'
 
 DEFAULT_GIT_BRANCH = 'main'
@@ -362,10 +361,8 @@ def convert_codes(df, target, translate):
 
 	# Append target column (if exists) - doesn't need conversion
 	if target in df.columns:
-		logger.debug("Has", len(df), target, "in file")
+		logger.debug(f"Has {len(df)} {target} in file")
 		codes = pd.concat([codes, df[target]])
-	# else:
-	# 	logger.debug("No",target,"in file")
 
 	if translate:
 		# Convert codes to target type
@@ -439,58 +436,53 @@ def map(phen_dir, target_code_type, translate=True):
 	# Process each folder in codes section
 	for folder in codes:
-		logger.debug(folder["description"])
-		if "files" in folder:
-			for file in folder["files"]:
-				logger.debug(f"--- {file["file"]} ---")
-				codes_file_path = codes_path / folder["folder"] / file["file"]
-
-				# Load Code File
-				if "excel_sheet" in file:
-					df = read_table_file(path=codes_file_path, excel_sheet=file["excel_sheet"])
-				else:
-					df = read_table_file(path=codes_file_path)
-
-				# Perform Structural Changes to file before preprocessing
-				# split column with multiple code types
-				logger.debug("Processing actions")
-				if ("actions" in file and "split_col" in file["actions"] and "codes_col" in file["actions"]):
-					split_col = file["actions"]["split_col"]
-					codes_col = file["actions"]["codes_col"]
-					logger.debug("Action: Splitting", split_col, "column into:", df[split_col].unique(),)
-					codes = df[codes_col]
-					oh = pd.get_dummies(df[split_col], dtype=bool)  # one hot encode
-					oh = oh.where((oh != True), codes, axis=0)  # fill in 1s with codes
-					oh[oh == False] = np.nan  # replace 0s with None
-					df = pd.concat([df, oh], axis=1)  # merge in new columns
-
-				# Preprocessing & Validation Checks
-				if "columns" in file:
-					meta_columns = []  # meta columns to keep with codes
-					if "actions" in file and "divide_col" in file["actions"]:
-						meta_columns += [file["actions"]["divide_col"]]
-					# TODO: enable metacolumns to be outputted - problem with map_file appending
-					if "metadata" in file["columns"]:
-						meta_columns += file["columns"]["metadata"]
-					df = preprocess(df,
-						file["columns"],
-						meta_columns=meta_columns,
-						codes_file=str(codes_file_path.resolve()),
-						target_code_type=target_code_type,
-						translate=translate)
-				else:
-					raise Exception("No column format provided")
-
-				# partition table by categorical column
-				if ("actions" in file and "divide_col" in file["actions"] and len(df) > 0):
-					divide_col = file["actions"]["divide_col"]
-					logger.debug("Action: Dividing Table by", divide_col, "column into: ", df[divide_col].unique(),)
-					df = df.groupby(divide_col)
-
-				# Map to Concept/Phenotype
-				if len(df) == 0:
-					pass
-				elif ("concept_set" in file) and isinstance(df, pd.core.frame.DataFrame):
+		for file in folder["files"]:
+			logger.debug(f"--- {file['file']} ---")
+			codes_file_path = codes_path / folder["folder"] / file["file"]
+
+			# Load Code File
+			if "excel_sheet" in file:
+				df = read_table_file(path=codes_file_path, excel_sheet=file["excel_sheet"])
+			else:
+				df = read_table_file(path=codes_file_path)
+
+			# Perform Structural Changes to file before preprocessing
+			# split column with multiple code types
+			logger.debug("Processing file structural actions")
+			if ("actions" in file and "split_col" in file["actions"] and "codes_col" in file["actions"]):
+				split_col = file["actions"]["split_col"]
+				codes_col = file["actions"]["codes_col"]
+				logger.debug(f"Action: Splitting {split_col} column into: {df[split_col].unique()}")
+				codes = df[codes_col]
+				oh = pd.get_dummies(df[split_col], dtype=bool)  # one hot encode
+				oh = oh.where((oh != True), codes, axis=0)  # fill in 1s with codes
+				oh[oh == False] = np.nan  # replace 0s with None
+				df = pd.concat([df, oh], axis=1)  # merge in new columns
+
+			# Preprocessing & Validation Checks
+			logger.debug("Processing and validating code formats")
+			meta_columns = []  # meta columns to keep with codes
+			if "actions" in file and "divide_col" in file["actions"]:
+				meta_columns += [file["actions"]["divide_col"]]
+			# TODO: enable metacolumns to be outputted - problem with map_file appending
+			if "metadata" in file["columns"]:
+				meta_columns += file["columns"]["metadata"]
+			df = preprocess(df,
+				file["columns"],
+				meta_columns=meta_columns,
+				codes_file=str(codes_file_path.resolve()),
+				target_code_type=target_code_type,
+				translate=translate)
+
+			# partition table by categorical column
+			if ("actions" in file and "divide_col" in file["actions"] and len(df) > 0):
+				divide_col = file["actions"]["divide_col"]
+				logger.debug(f"Action: Dividing table by {divide_col} column into: {df[divide_col].unique()}")
+				df = df.groupby(divide_col)
+
+			# Map to Concept/Phenotype
+			if len(df) != 0:
+				if ("concept_set" in file) and isinstance(df, pd.core.frame.DataFrame):
 					out = map_file(df,
 						target_code_type,
 						out,
 						concepts=file["concept_set"],
@@ -508,12 +500,9 @@ def map(phen_dir, target_code_type, translate=True):
 							concepts=file["concept_set_categories"][cat],
 							meta_columns=meta_columns,)
 
-		else:
-			logger.warning("Folder is empty")
-
 	# test if there's any output from processing
-	if len(out) <= 0:
-		raise Exception("Processing has not produced any output")
+	if len(out.index) == 0:
+		raise Exception(f"Map processing produced no output; check that the configuration file {str(config_path.resolve())} is not empty")
 
 	# Final processing
 	out = out.reset_index(drop=True)
@@ -564,14 +553,6 @@ def map(phen_dir, target_code_type, translate=True):
 
 	# copy version files used for mapping to repo
 	shutil.copy(trud.VERSION_PATH, phen_path / trud.VERSION_FILE)
 	shutil.copy(omop.VERSION_PATH, phen_path / omop.VERSION_FILE)
-
-	# write erros to a file
-	error_path = phen_path / ERROR_FILE
-	if error_path.exists():
-		error_df = pd.read_csv(error_path)
-		error_df = error_df.drop_duplicates()  # Remove Duplicates from Error file
-		error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"])
-		error_df.to_csv(error_path, index=False)
 
 	logger.debug(f"Saved concept_sets to {str(concept_set_path.resolve())}")
diff --git a/tests/test_acmc.py b/tests/test_acmc.py
index 6881cc33fa87e7f5fa637ba81c6f38dfbc968ed1..ee92004ce03c3518e17bb71c30dee6b30eb14c3b 100644
--- a/tests/test_acmc.py
+++ b/tests/test_acmc.py
@@ -29,14 +29,6 @@ def logger():
 	stream_handler = logging.StreamHandler(sys.stdout)
 	logger.addHandler(stream_handler)
 
-def test_phen_init_local_default(tmp_dir, monkeypatch, caplog):
-	with caplog.at_level(logging.DEBUG):
-		monkeypatch.setattr(sys, "argv", ["main.py", "phen", "init"])
-		# Mock input() to return "yes" to the question about reinitialising the directory
-		monkeypatch.setattr("builtins.input", lambda _: "y")
-		main.main()
-		assert "Phenotype initialised successfully" in caplog.text
-
 def test_phen_init_local_specified(tmp_dir, monkeypatch, caplog):
 	with caplog.at_level(logging.DEBUG):
 		phen_path = tmp_dir / "phen"