From 3a7d715c39598095747e24a43ae8700c6754d2ba Mon Sep 17 00:00:00 2001 From: Michael Boniface <m.j.boniface@soton.ac.uk> Date: Mon, 24 Feb 2025 20:52:39 +0000 Subject: [PATCH] started the precommit hook work, but seems more complex as it requires some download from github etc with usernames and passwords #21 --- acmc/phen.py | 72 ++++++++++++++++++++++------------------------ pyproject.toml | 2 +- tests/test_acmc.py | 2 +- 3 files changed, 37 insertions(+), 39 deletions(-) diff --git a/acmc/phen.py b/acmc/phen.py index 3157b52..8669668 100644 --- a/acmc/phen.py +++ b/acmc/phen.py @@ -44,6 +44,7 @@ COL_ACTIONS = [SPLIT_COL_ACTION, CODES_COL_ACTION, DIVIDE_COL_ACTION] CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"] + class PhenValidationException(Exception): """Custom exception class raised when validation errors in phenotype configuration file""" @@ -286,8 +287,8 @@ def validate(phen_dir): concept_set_names.append(item["name"]) # TODO: change this to some sort of yaml schema validation - required_keys = {"name", "file", "metadata"} - + required_keys = {"name", "file", "metadata"} + # check codes definition for item in phenotype["concept_sets"]: @@ -308,10 +309,12 @@ def validate(phen_dir): # check code file type is supported if concept_code_file_path.suffix not in CODE_FILE_TYPES: - raise ValueError(f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types") - - # check columns specified are a supported medical coding type - for column in item["file"]["columns"]: + raise ValueError( + f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types" + ) + + # check columns specified are a supported medical coding type + for column in item["file"]["columns"]: if column not in code_types: validation_errors.append( f"Column type {column} for file {concept_code_file_path} is not supported" @@ -321,9 +324,7 @@ def validate(phen_dir): if "actions" in item["file"]: for action in item["file"]["actions"]: if action not in COL_ACTIONS: - validation_errors.append( - f"Action {action} is not supported" - ) + validation_errors.append(f"Action {action} is not supported") else: validation_errors.append( @@ -356,7 +357,9 @@ def read_table_file(path, excel_sheet=None): elif path.suffix == ".dta": df = pd.read_stata(path, dtype=str) else: - raise ValueError(f"Unsupported filetype {codes_file_path.suffix}, only support{CODE_FILE_TYPES} code file types") + raise ValueError( + f"Unsupported filetype {codes_file_path.suffix}, only support{CODE_FILE_TYPES} code file types" + ) return df @@ -392,10 +395,13 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None): out = pd.DataFrame([]) # create output df to append to code_errors = [] # list of errors from processing - # TODO: Is there a better way of processing this action as it's distributed across + # TODO: Is there a better way of processing this action as it's distributed across # different parts of the programme. - if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"]: - divide_col_df = df[concept_set["file"]["actions"]["divide_col"]] + if ( + "actions" in concept_set["file"] + and "divide_col" in concept_set["file"]["actions"] + ): + divide_col_df = df[concept_set["file"]["actions"]["divide_col"]] else: divide_col_df = pd.DataFrame() @@ -421,10 +427,10 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None): [out, pd.DataFrame({code_type: codes}).join(divide_col_df)], ignore_index=True, ) - + return out, code_errors - - + + # Translate Df with multiple codes into single code type Series def translate_codes(df, target_code_type): codes = pd.Series([], dtype=str) @@ -578,33 +584,27 @@ def map(phen_dir, target_code_type): logger.debug(f"Length of errors from preprocess {len(errors)}") if len(errors) > 0: code_errors.extend(errors) - logger.debug(f" Length of code_errors {len(code_errors)}") - + logger.debug(f" Length of code_errors {len(code_errors)}") + # Map # if processing a source coding list with categorical data - if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"] and len(df) > 0: + if ( + "actions" in concept_set["file"] + and "divide_col" in concept_set["file"]["actions"] + and len(df) > 0 + ): divide_col = concept_set["file"]["actions"]["divide_col"] logger.debug(f"Action: Dividing Table by {divide_col}") logger.debug(f"column into: {df[divide_col].unique()}") df_grp = df.groupby(divide_col) - for cat, grp in df_grp: + for cat, grp in df_grp: if cat == concept_set["file"]["category"]: - grp = grp.drop( - columns=[divide_col] - ) # delete categorical column + grp = grp.drop(columns=[divide_col]) # delete categorical column out = map_file( - grp, - target_code_type, - out, - concept_name=concept_set['name'] + grp, target_code_type, out, concept_name=concept_set["name"] ) else: - out = map_file( - df, - target_code_type, - out, - concept_name=concept_set['name'] - ) + out = map_file(df, target_code_type, out, concept_name=concept_set["name"]) if len(code_errors) > 0: logger.error(f"The map processing has {len(code_errors)} errors") @@ -619,7 +619,7 @@ def map(phen_dir, target_code_type): raise Exception( f"No output after map processing, check config {str(config_path.resolve())}" ) - + # Final processing out = out.reset_index(drop=True) out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"]) @@ -847,9 +847,7 @@ def diff(phen_dir, phen_old_dir): new_config = new_phen_path / CONFIG_FILE with new_config.open("r") as file: new_config = yaml.safe_load(file) - report.write( - f"\n\n# Report for version {new_config['phenotype']['version']}\n\n" - ) + report.write(f"\n\n# Report for version {new_config['phenotype']['version']}\n\n") report.write(f"- Removed outputs: {list(removed_outputs)}\n") report.write(f"- Added outputs: {list(added_outputs)}\n") report.write(f"- Common outputs: {list(common_outputs)}\n") diff --git a/pyproject.toml b/pyproject.toml index 9516588..439a5d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ dependencies = [ [tool.hatch.envs.dev] dependencies = [ - "pydocstyle", + "pydocstyle", "pytest", "black", "mypy" diff --git a/tests/test_acmc.py b/tests/test_acmc.py index e70fc0a..43b8197 100644 --- a/tests/test_acmc.py +++ b/tests/test_acmc.py @@ -51,7 +51,7 @@ def test_phen_init_local_specified(tmp_dir, monkeypatch, caplog): [ ("config1.yaml"), # config.yaml test case ("config2.yaml"), # config.yaml test case - ("config3.yaml"), # config.yaml test case + ("config3.yaml"), # config.yaml test case ], ) def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file): -- GitLab