Commit 9d3f9fae authored by mjbonifa

Merge branch '21-add-mypy-and-black-as-precommit-hook' into 'dev'

started the precommit hook work, but seems more complex as it requires some...

Closes #21

See merge request meldb/concepts-processing!11
parents cae3acc7 2e58ef45
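
The branch merged here names black and mypy as pre-commit hooks, while the message notes the hook work was only started. As a hedged illustration of where that work was heading (this file is not part of the commit), a minimal `.pre-commit-config.yaml` using the standard pre-commit framework might look like:

```yaml
# Hypothetical sketch, not part of this commit: runs black and mypy on
# staged files via the pre-commit framework's published hook repos.
repos:
  - repo: https://github.com/psf/black
    rev: 24.1.1   # assumed version pin
    hooks:
      - id: black
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.8.0   # assumed version pin
    hooks:
      - id: mypy
        additional_dependencies: [pandas-stubs]  # mypy runs in an isolated env
```

The isolated hook environment is likely part of the complexity the message alludes to: mypy under pre-commit cannot see the project's dependencies unless they are re-declared via `additional_dependencies`. The changes below instead land a simpler hatch script (`check = "black . && mypy ."`) invoked as `hatch run check`.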
@@ -44,6 +44,7 @@ COL_ACTIONS = [SPLIT_COL_ACTION, CODES_COL_ACTION, DIVIDE_COL_ACTION]
 CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"]


 class PhenValidationException(Exception):
     """Custom exception class raised when validation errors in phenotype configuration file"""

@@ -286,8 +287,8 @@ def validate(phen_dir):
         concept_set_names.append(item["name"])

     # TODO: change this to some sort of yaml schema validation
     required_keys = {"name", "file", "metadata"}

     # check codes definition
     for item in phenotype["concept_sets"]:
@@ -308,10 +309,12 @@ def validate(phen_dir):
             # check code file type is supported
             if concept_code_file_path.suffix not in CODE_FILE_TYPES:
-                raise ValueError(f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types")
+                raise ValueError(
+                    f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types"
+                )

             # check columns specified are a supported medical coding type
             for column in item["file"]["columns"]:
                 if column not in code_types:
                     validation_errors.append(
                         f"Column type {column} for file {concept_code_file_path} is not supported"
@@ -321,9 +324,7 @@ def validate(phen_dir):
             if "actions" in item["file"]:
                 for action in item["file"]["actions"]:
                     if action not in COL_ACTIONS:
-                        validation_errors.append(
-                            f"Action {action} is not supported"
-                        )
+                        validation_errors.append(f"Action {action} is not supported")
         else:
             validation_errors.append(
@@ -356,7 +357,9 @@ def read_table_file(path, excel_sheet=None):
     elif path.suffix == ".dta":
         df = pd.read_stata(path, dtype=str)
     else:
-        raise ValueError(f"Unsupported filetype {codes_file_path.suffix}, only support{CODE_FILE_TYPES} code file types")
+        raise ValueError(
+            f"Unsupported filetype {codes_file_path.suffix}, only support{CODE_FILE_TYPES} code file types"
+        )

     return df
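
One detail worth flagging in this hunk: both the old and new error lines reference `codes_file_path`, but the function's parameter is `path`, so the fallback branch would raise `NameError` before the intended `ValueError`. A hedged sketch of the function with that name corrected, assuming the unshown branches read csv and excel files in the same dtype-as-string style:

```python
import pandas as pd

CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"]

def read_table_file(path, excel_sheet=None):
    """Read a code table from a pathlib.Path; sketch of the intended behavior."""
    # Assumed csv/excel branches; only the .dta and else branches appear in the diff.
    if path.suffix == ".csv":
        df = pd.read_csv(path, dtype=str)
    elif path.suffix in (".xlsx", ".xls"):
        df = pd.read_excel(path, sheet_name=excel_sheet or 0, dtype=str)
    elif path.suffix == ".dta":
        df = pd.read_stata(path).astype(str)  # coerce to str after reading
    else:
        # Use the function's own `path` parameter; the committed code names an
        # undefined `codes_file_path` here.
        raise ValueError(
            f"Unsupported filetype {path.suffix}, only support {CODE_FILE_TYPES} code file types"
        )
    return df
```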
@@ -392,10 +395,13 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
     out = pd.DataFrame([])  # create output df to append to
     code_errors = []  # list of errors from processing

     # TODO: Is there a better way of processing this action as it's distributed across
     # different parts of the programme.
-    if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"]:
+    if (
+        "actions" in concept_set["file"]
+        and "divide_col" in concept_set["file"]["actions"]
+    ):
         divide_col_df = df[concept_set["file"]["actions"]["divide_col"]]
     else:
         divide_col_df = pd.DataFrame()
@@ -421,10 +427,10 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
                 [out, pd.DataFrame({code_type: codes}).join(divide_col_df)],
                 ignore_index=True,
             )

     return out, code_errors


 # Translate Df with multiple codes into single code type Series
 def translate_codes(df, target_code_type):
     codes = pd.Series([], dtype=str)
@@ -578,33 +584,27 @@ def map(phen_dir, target_code_type):
         logger.debug(f"Length of errors from preprocess {len(errors)}")
         if len(errors) > 0:
             code_errors.extend(errors)
         logger.debug(f" Length of code_errors {len(code_errors)}")

         # Map
         # if processing a source coding list with categorical data
-        if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"] and len(df) > 0:
+        if (
+            "actions" in concept_set["file"]
+            and "divide_col" in concept_set["file"]["actions"]
+            and len(df) > 0
+        ):
             divide_col = concept_set["file"]["actions"]["divide_col"]
             logger.debug(f"Action: Dividing Table by {divide_col}")
             logger.debug(f"column into: {df[divide_col].unique()}")
             df_grp = df.groupby(divide_col)
             for cat, grp in df_grp:
                 if cat == concept_set["file"]["category"]:
-                    grp = grp.drop(
-                        columns=[divide_col]
-                    )  # delete categorical column
+                    grp = grp.drop(columns=[divide_col])  # delete categorical column
                     out = map_file(
-                        grp,
-                        target_code_type,
-                        out,
-                        concept_name=concept_set['name']
+                        grp, target_code_type, out, concept_name=concept_set["name"]
                     )
         else:
-            out = map_file(
-                df,
-                target_code_type,
-                out,
-                concept_name=concept_set['name']
-            )
+            out = map_file(df, target_code_type, out, concept_name=concept_set["name"])

     if len(code_errors) > 0:
         logger.error(f"The map processing has {len(code_errors)} errors")
@@ -619,7 +619,7 @@ def map(phen_dir, target_code_type):
         raise Exception(
             f"No output after map processing, check config {str(config_path.resolve())}"
         )

     # Final processing
     out = out.reset_index(drop=True)
     out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
@@ -847,9 +847,7 @@ def diff(phen_dir, phen_old_dir):
     new_config = new_phen_path / CONFIG_FILE
     with new_config.open("r") as file:
         new_config = yaml.safe_load(file)
-    report.write(
-        f"\n\n# Report for version {new_config['phenotype']['version']}\n\n"
-    )
+    report.write(f"\n\n# Report for version {new_config['phenotype']['version']}\n\n")
     report.write(f"- Removed outputs: {list(removed_outputs)}\n")
     report.write(f"- Added outputs: {list(added_outputs)}\n")
     report.write(f"- Common outputs: {list(common_outputs)}\n")
@@ -6,7 +6,7 @@ import shutil
 import hashlib
 import zipfile
 import pandas as pd
-import simpledbf
+import simpledbf  # type: ignore
 import yaml
 from pathlib import Path
@@ -210,6 +210,12 @@ We have two separate environments to ensure that development dependencies (such
 - default environment: includes the core dependencies to run acmc (e.g., requests, etc.).
 - dev environment: includes additional tools for testing, code formatting, linting, and other development workflows (e.g., pytest, black, mypy, etc.).

+The development tools used include:
+
+- pytest: testing
+- mypy: type checking
+- black: code formatting
+
 ### Activate the Development Environment
 To enter the (dev) development environment, use:
@@ -231,33 +237,40 @@ To exit an environment from hatch, use:
 exit
 ```

+### Running Tests
+To run tests using `pytest`, use:
+```sh
+hatch run pytest
+```
+
+### All Code Checks
+The project can run all type and formatting checks with:
+```sh
+hatch run check
+```
+
 ### Code Formatting
 The project uses `black` for code formatting. Ensure your code is properly formatted before committing.

 To check if any of the code needs to be formatted, run black with the `--check` option:
 ```sh
-hatch run black --check acmc
+hatch run black --check .
 ```

 To format the code and modify the files, use:
 ```sh
-hatch run black acmc
+hatch run black .
 ```

 ### Type Checking
 The project uses `mypy` for type checking:
 ```sh
-hatch run mypy -p acmc
+hatch run mypy .
 ```
-
-### Running Tests
-To run tests using `pytest`, use:
-```sh
-hatch run pytest
-```
 ## Building the Package
@@ -62,12 +62,15 @@ dependencies = [
 [tool.hatch.envs.dev]
 dependencies = [
     "pydocstyle",
     "pytest",
     "black",
     "mypy"
 ]

+[tool.hatch.envs.default.scripts]
+check = "black . && mypy ."
+
 [tool.hatch.build]
 include = ["acmc/**"]  # Ensure only the acmc package is included
@@ -76,5 +79,5 @@ include = [
     "acmc/**",
 ]

+[tool.mypy]
+ignore_missing_imports = true
\ No newline at end of file
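
The global `ignore_missing_imports = true` added here and the `# type: ignore` on the `simpledbf` import above do overlapping work: each silences mypy's missing-stub error for untyped third-party packages. A narrower alternative, sketched as an assumption rather than what this commit does, is mypy's per-module override table, which keeps mypy able to flag misspelled first-party imports that the global flag would mask:

```toml
# Hypothetical alternative to the blanket setting above: relax mypy only
# for named packages that ship no type stubs.
[[tool.mypy.overrides]]
module = ["simpledbf", "simpledbf.*"]
ignore_missing_imports = true
```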
@@ -8,7 +8,7 @@ from pathlib import Path
 from acmc import trud, omop, main, logging_config as lc

 # setup logging
-logger = lc.setup_logger()
+lc.setup_logger()


 @pytest.fixture
@@ -51,7 +51,7 @@ def test_phen_init_local_specified(tmp_dir, monkeypatch, caplog):
     [
         ("config1.yaml"),  # config.yaml test case
         ("config2.yaml"),  # config.yaml test case
         ("config3.yaml"),  # config.yaml test case
     ],
 )
 def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file):