diff --git a/acmc/phen.py b/acmc/phen.py index 3157b52b7c671d5ced5f2a3d4c5141e12de2ba3d..866966863f61854779836ead5335e498cb3f3ba6 100644 --- a/acmc/phen.py +++ b/acmc/phen.py @@ -44,6 +44,7 @@ COL_ACTIONS = [SPLIT_COL_ACTION, CODES_COL_ACTION, DIVIDE_COL_ACTION] CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"] + class PhenValidationException(Exception): """Custom exception class raised when validation errors in phenotype configuration file""" @@ -286,8 +287,8 @@ def validate(phen_dir): concept_set_names.append(item["name"]) # TODO: change this to some sort of yaml schema validation - required_keys = {"name", "file", "metadata"} - + required_keys = {"name", "file", "metadata"} + # check codes definition for item in phenotype["concept_sets"]: @@ -308,10 +309,12 @@ def validate(phen_dir): # check code file type is supported if concept_code_file_path.suffix not in CODE_FILE_TYPES: - raise ValueError(f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types") - - # check columns specified are a supported medical coding type - for column in item["file"]["columns"]: + raise ValueError( + f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types" + ) + + # check columns specified are a supported medical coding type + for column in item["file"]["columns"]: if column not in code_types: validation_errors.append( f"Column type {column} for file {concept_code_file_path} is not supported" @@ -321,9 +324,7 @@ def validate(phen_dir): if "actions" in item["file"]: for action in item["file"]["actions"]: if action not in COL_ACTIONS: - validation_errors.append( - f"Action {action} is not supported" - ) + validation_errors.append(f"Action {action} is not supported") else: validation_errors.append( @@ -356,7 +357,9 @@ def read_table_file(path, excel_sheet=None): elif path.suffix == ".dta": df = pd.read_stata(path, dtype=str) else: - raise ValueError(f"Unsupported filetype {codes_file_path.suffix}, only 
support{CODE_FILE_TYPES} code file types") + raise ValueError( + f"Unsupported filetype {codes_file_path.suffix}, only support{CODE_FILE_TYPES} code file types" + ) return df @@ -392,10 +395,13 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None): out = pd.DataFrame([]) # create output df to append to code_errors = [] # list of errors from processing - # TODO: Is there a better way of processing this action as it's distributed across + # TODO: Is there a better way of processing this action as it's distributed across # different parts of the programme. - if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"]: - divide_col_df = df[concept_set["file"]["actions"]["divide_col"]] + if ( + "actions" in concept_set["file"] + and "divide_col" in concept_set["file"]["actions"] + ): + divide_col_df = df[concept_set["file"]["actions"]["divide_col"]] else: divide_col_df = pd.DataFrame() @@ -421,10 +427,10 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None): [out, pd.DataFrame({code_type: codes}).join(divide_col_df)], ignore_index=True, ) - + return out, code_errors - - + + # Translate Df with multiple codes into single code type Series def translate_codes(df, target_code_type): codes = pd.Series([], dtype=str) @@ -578,33 +584,27 @@ def map(phen_dir, target_code_type): logger.debug(f"Length of errors from preprocess {len(errors)}") if len(errors) > 0: code_errors.extend(errors) - logger.debug(f" Length of code_errors {len(code_errors)}") - + logger.debug(f" Length of code_errors {len(code_errors)}") + # Map # if processing a source coding list with categorical data - if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"] and len(df) > 0: + if ( + "actions" in concept_set["file"] + and "divide_col" in concept_set["file"]["actions"] + and len(df) > 0 + ): divide_col = concept_set["file"]["actions"]["divide_col"] logger.debug(f"Action: Dividing Table by 
{divide_col}") logger.debug(f"column into: {df[divide_col].unique()}") df_grp = df.groupby(divide_col) - for cat, grp in df_grp: + for cat, grp in df_grp: if cat == concept_set["file"]["category"]: - grp = grp.drop( - columns=[divide_col] - ) # delete categorical column + grp = grp.drop(columns=[divide_col]) # delete categorical column out = map_file( - grp, - target_code_type, - out, - concept_name=concept_set['name'] + grp, target_code_type, out, concept_name=concept_set["name"] ) else: - out = map_file( - df, - target_code_type, - out, - concept_name=concept_set['name'] - ) + out = map_file(df, target_code_type, out, concept_name=concept_set["name"]) if len(code_errors) > 0: logger.error(f"The map processing has {len(code_errors)} errors") @@ -619,7 +619,7 @@ def map(phen_dir, target_code_type): raise Exception( f"No output after map processing, check config {str(config_path.resolve())}" ) - + # Final processing out = out.reset_index(drop=True) out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"]) @@ -847,9 +847,7 @@ def diff(phen_dir, phen_old_dir): new_config = new_phen_path / CONFIG_FILE with new_config.open("r") as file: new_config = yaml.safe_load(file) - report.write( - f"\n\n# Report for version {new_config['phenotype']['version']}\n\n" - ) + report.write(f"\n\n# Report for version {new_config['phenotype']['version']}\n\n") report.write(f"- Removed outputs: {list(removed_outputs)}\n") report.write(f"- Added outputs: {list(added_outputs)}\n") report.write(f"- Common outputs: {list(common_outputs)}\n") diff --git a/acmc/trud.py b/acmc/trud.py index 57a0afe4f8d9f2393011a3a5bc832ea67ba4e7dc..d8852cc5b424c20e7271bd6162e849671f5ba840 100644 --- a/acmc/trud.py +++ b/acmc/trud.py @@ -6,7 +6,7 @@ import shutil import hashlib import zipfile import pandas as pd -import simpledbf +import simpledbf # type: ignore import yaml from pathlib import Path diff --git a/docs/index.md b/docs/index.md index 
07791b90d48d86862acf4fd0ba14ba9897535ead..85db380914afbfd25c9795dda9145392a03cad68 100644 --- a/docs/index.md +++ b/docs/index.md @@ -210,6 +210,12 @@ We have two separate environments to ensure that development dependencies (such - default environment: includes the core dependencies to run acmc (e.g., requests, etc.). - dev environment: includes additional tools for testing, code formatting, linting, and other development workflows (e.g., pytest, black, mypy, etc.). +The development tools used include: + +- pytest: testing +- mypy: type checking +- black: code formatting + ### Activate the Development Environment To enter the (dev) development environment, use: @@ -231,33 +237,40 @@ To exit an environment from hatch, use: exit ``` +### Running Tests +To run tests using `pytest`, use: + +```sh +hatch run pytest +``` + +### All code checks +To run all type and formatting checks, use: + +```sh +hatch run check +``` + ### Code Formatting The project uses `black` for code formatting. Ensure your code is properly formatted before committing To check if any of the code needs to be formatted, run black with the `--check` option ```sh -hatch run black --check acmc +hatch run black --check . ``` To format the coode and modify the files, use ```sh -hatch run black acmc +hatch run black . ``` ### Type Checking The project uses `mypy` for type checking: ```sh -hatch run mypy -p acmc -``` - -### Running Tests -To run tests using `pytest`, use: - -```sh -hatch run pytest +hatch run mypy . ``` ## Building the Package diff --git a/pyproject.toml b/pyproject.toml index 951658810ee1ac97c4c31dbfb1efb8155cae9a52..4e861658f7a7638d8aad78304f561f217b88eb3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,12 +62,15 @@ dependencies = [ [tool.hatch.envs.dev] dependencies = [ - "pydocstyle", + "pydocstyle", "pytest", "black", "mypy" ] +[tool.hatch.envs.default.scripts] +check = "black . && mypy ." 
+ [tool.hatch.build] include = ["acmc/**"] # Ensure only the acmc package is included @@ -76,5 +79,5 @@ include = [ "acmc/**", ] -[tool.mypy] -ignore_missing_imports = true \ No newline at end of file + + diff --git a/tests/test_acmc.py b/tests/test_acmc.py index e70fc0aef14e000f8570c7f91a0a3fb2feb695b8..0ad6f863f595dbe9c78acc368ff4a0bd7307648f 100644 --- a/tests/test_acmc.py +++ b/tests/test_acmc.py @@ -8,7 +8,7 @@ from pathlib import Path from acmc import trud, omop, main, logging_config as lc # setup logging -logger = lc.setup_logger() +lc.setup_logger() @pytest.fixture @@ -51,7 +51,7 @@ def test_phen_init_local_specified(tmp_dir, monkeypatch, caplog): [ ("config1.yaml"), # config.yaml test case ("config2.yaml"), # config.yaml test case - ("config3.yaml"), # config.yaml test case + ("config3.yaml"), # config.yaml test case ], ) def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file):