Commit 9d3f9fae authored by mjbonifa

Merge branch '21-add-mypy-and-black-as-precommit-hook' into 'dev'

started the precommit hook work, but seems more complex as it requires some...

Closes #21

See merge request meldb/concepts-processing!11
parents cae3acc7 2e58ef45
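
The branch merged here names black and mypy as pre-commit hooks, while the message notes the hook work was only started. As a hedged illustration of where that work was heading (this file is not part of the commit), a minimal `.pre-commit-config.yaml` using the standard pre-commit framework might look like:

```yaml
# Hypothetical sketch, not part of this commit: runs black and mypy on
# staged files via the pre-commit framework's published hook repos.
repos:
  - repo: https://github.com/psf/black
    rev: 24.1.1   # assumed version pin
    hooks:
      - id: black
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.8.0   # assumed version pin
    hooks:
      - id: mypy
        additional_dependencies: [pandas-stubs]  # mypy runs in an isolated env
```

The isolated hook environment is likely part of the complexity the message alludes to: mypy under pre-commit cannot see the project's dependencies unless they are re-declared via `additional_dependencies`. The changes below instead land a simpler hatch script (`check = "black . && mypy ."`) invoked as `hatch run check`.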
@@ -44,6 +44,7 @@ COL_ACTIONS = [SPLIT_COL_ACTION, CODES_COL_ACTION, DIVIDE_COL_ACTION]
 CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"]


 class PhenValidationException(Exception):
     """Custom exception class raised when validation errors in phenotype configuration file"""

@@ -286,8 +287,8 @@ def validate(phen_dir):
         concept_set_names.append(item["name"])

     # TODO: change this to some sort of yaml schema validation
     required_keys = {"name", "file", "metadata"}

     # check codes definition
     for item in phenotype["concept_sets"]:
@@ -308,10 +309,12 @@ def validate(phen_dir):
             # check code file type is supported
             if concept_code_file_path.suffix not in CODE_FILE_TYPES:
-                raise ValueError(f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types")
+                raise ValueError(
+                    f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types"
+                )

             # check columns specified are a supported medical coding type
             for column in item["file"]["columns"]:
                 if column not in code_types:
                     validation_errors.append(
                         f"Column type {column} for file {concept_code_file_path} is not supported"
@@ -321,9 +324,7 @@ def validate(phen_dir):
             if "actions" in item["file"]:
                 for action in item["file"]["actions"]:
                     if action not in COL_ACTIONS:
-                        validation_errors.append(
-                            f"Action {action} is not supported"
-                        )
+                        validation_errors.append(f"Action {action} is not supported")
         else:
             validation_errors.append(
@@ -356,7 +357,9 @@ def read_table_file(path, excel_sheet=None):
     elif path.suffix == ".dta":
         df = pd.read_stata(path, dtype=str)
     else:
-        raise ValueError(f"Unsupported filetype {codes_file_path.suffix}, only support{CODE_FILE_TYPES} code file types")
+        raise ValueError(
+            f"Unsupported filetype {codes_file_path.suffix}, only support{CODE_FILE_TYPES} code file types"
+        )

     return df
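
One detail worth flagging in this hunk: both the old and new error lines reference `codes_file_path`, but the function's parameter is `path`, so the fallback branch would raise `NameError` before the intended `ValueError`. A hedged sketch of the function with that name corrected, assuming the unshown branches read csv and excel files in the same dtype-as-string style:

```python
import pandas as pd

CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"]

def read_table_file(path, excel_sheet=None):
    """Read a code table from a pathlib.Path; sketch of the intended behavior."""
    # Assumed csv/excel branches; only the .dta and else branches appear in the diff.
    if path.suffix == ".csv":
        df = pd.read_csv(path, dtype=str)
    elif path.suffix in (".xlsx", ".xls"):
        df = pd.read_excel(path, sheet_name=excel_sheet or 0, dtype=str)
    elif path.suffix == ".dta":
        df = pd.read_stata(path).astype(str)  # coerce to str after reading
    else:
        # Use the function's own `path` parameter; the committed code names an
        # undefined `codes_file_path` here.
        raise ValueError(
            f"Unsupported filetype {path.suffix}, only support {CODE_FILE_TYPES} code file types"
        )
    return df
```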
@@ -392,10 +395,13 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
     out = pd.DataFrame([])  # create output df to append to
     code_errors = []  # list of errors from processing

     # TODO: Is there a better way of processing this action as it's distributed across
     # different parts of the programme.
-    if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"]:
+    if (
+        "actions" in concept_set["file"]
+        and "divide_col" in concept_set["file"]["actions"]
+    ):
         divide_col_df = df[concept_set["file"]["actions"]["divide_col"]]
     else:
         divide_col_df = pd.DataFrame()
@@ -421,10 +427,10 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
                 [out, pd.DataFrame({code_type: codes}).join(divide_col_df)],
                 ignore_index=True,
             )

     return out, code_errors


 # Translate Df with multiple codes into single code type Series
 def translate_codes(df, target_code_type):
     codes = pd.Series([], dtype=str)
@@ -578,33 +584,27 @@ def map(phen_dir, target_code_type):
         logger.debug(f"Length of errors from preprocess {len(errors)}")
         if len(errors) > 0:
             code_errors.extend(errors)
         logger.debug(f" Length of code_errors {len(code_errors)}")

         # Map
         # if processing a source coding list with categorical data
-        if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"] and len(df) > 0:
+        if (
+            "actions" in concept_set["file"]
+            and "divide_col" in concept_set["file"]["actions"]
+            and len(df) > 0
+        ):
             divide_col = concept_set["file"]["actions"]["divide_col"]
             logger.debug(f"Action: Dividing Table by {divide_col}")
             logger.debug(f"column into: {df[divide_col].unique()}")
             df_grp = df.groupby(divide_col)
             for cat, grp in df_grp:
                 if cat == concept_set["file"]["category"]:
-                    grp = grp.drop(
-                        columns=[divide_col]
-                    )  # delete categorical column
+                    grp = grp.drop(columns=[divide_col])  # delete categorical column
                     out = map_file(
-                        grp,
-                        target_code_type,
-                        out,
-                        concept_name=concept_set['name']
+                        grp, target_code_type, out, concept_name=concept_set["name"]
                     )
         else:
-            out = map_file(
-                df,
-                target_code_type,
-                out,
-                concept_name=concept_set['name']
-            )
+            out = map_file(df, target_code_type, out, concept_name=concept_set["name"])

     if len(code_errors) > 0:
         logger.error(f"The map processing has {len(code_errors)} errors")
@@ -619,7 +619,7 @@ def map(phen_dir, target_code_type):
         raise Exception(
             f"No output after map processing, check config {str(config_path.resolve())}"
         )

     # Final processing
     out = out.reset_index(drop=True)
     out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
@@ -847,9 +847,7 @@ def diff(phen_dir, phen_old_dir):
     new_config = new_phen_path / CONFIG_FILE
     with new_config.open("r") as file:
         new_config = yaml.safe_load(file)
-    report.write(
-        f"\n\n# Report for version {new_config['phenotype']['version']}\n\n"
-    )
+    report.write(f"\n\n# Report for version {new_config['phenotype']['version']}\n\n")
     report.write(f"- Removed outputs: {list(removed_outputs)}\n")
     report.write(f"- Added outputs: {list(added_outputs)}\n")
     report.write(f"- Common outputs: {list(common_outputs)}\n")
@@ -6,7 +6,7 @@ import shutil
 import hashlib
 import zipfile
 import pandas as pd
-import simpledbf
+import simpledbf  # type: ignore
 import yaml
 from pathlib import Path
@@ -210,6 +210,12 @@ We have two separate environments to ensure that development dependencies (such
 - default environment: includes the core dependencies to run acmc (e.g., requests, etc.).
 - dev environment: includes additional tools for testing, code formatting, linting, and other development workflows (e.g., pytest, black, mypy, etc.).

+The development tools used include:
+
+- pytest: testing
+- mypy: type checking
+- black: code formatting
+
 ### Activate the Development Environment
 To enter the (dev) development environment, use:
@@ -231,33 +237,40 @@ To exit an environment from hatch, use:
 exit
 ```

+### Running Tests
+To run tests using `pytest`, use:
+```sh
+hatch run pytest
+```
+
+### All Code Checks
+The project can run all type and formatting checks with:
+```sh
+hatch run check
+```
+
 ### Code Formatting
 The project uses `black` for code formatting. Ensure your code is properly formatted before committing.

 To check if any of the code needs to be formatted, run black with the `--check` option:
 ```sh
-hatch run black --check acmc
+hatch run black --check .
 ```

 To format the code and modify the files, use:
 ```sh
-hatch run black acmc
+hatch run black .
 ```

 ### Type Checking
 The project uses `mypy` for type checking:
 ```sh
-hatch run mypy -p acmc
+hatch run mypy .
 ```
-
-### Running Tests
-To run tests using `pytest`, use:
-```sh
-hatch run pytest
-```
 ## Building the Package
@@ -62,12 +62,15 @@ dependencies = [
 [tool.hatch.envs.dev]
 dependencies = [
     "pydocstyle",
     "pytest",
     "black",
     "mypy"
 ]

+[tool.hatch.envs.default.scripts]
+check = "black . && mypy ."
+
 [tool.hatch.build]
 include = ["acmc/**"]  # Ensure only the acmc package is included
@@ -76,5 +79,5 @@ include = [
     "acmc/**",
 ]

+[tool.mypy]
+ignore_missing_imports = true
\ No newline at end of file
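
The global `ignore_missing_imports = true` added here and the `# type: ignore` on the `simpledbf` import above do overlapping work: each silences mypy's missing-stub error for untyped third-party packages. A narrower alternative, sketched as an assumption rather than what this commit does, is mypy's per-module override table, which keeps mypy able to flag misspelled first-party imports that the global flag would mask:

```toml
# Hypothetical alternative to the blanket setting above: relax mypy only
# for named packages that ship no type stubs.
[[tool.mypy.overrides]]
module = ["simpledbf", "simpledbf.*"]
ignore_missing_imports = true
```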
@@ -8,7 +8,7 @@ from pathlib import Path
 from acmc import trud, omop, main, logging_config as lc

 # setup logging
-logger = lc.setup_logger()
+lc.setup_logger()


 @pytest.fixture
@@ -51,7 +51,7 @@ def test_phen_init_local_specified(tmp_dir, monkeypatch, caplog):
     [
         ("config1.yaml"),  # config.yaml test case
         ("config2.yaml"),  # config.yaml test case
         ("config3.yaml"),  # config.yaml test case
     ],
 )
 def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file):