From 3a7d715c39598095747e24a43ae8700c6754d2ba Mon Sep 17 00:00:00 2001
From: Michael Boniface <m.j.boniface@soton.ac.uk>
Date: Mon, 24 Feb 2025 20:52:39 +0000
Subject: [PATCH] Start the pre-commit hook work; it is proving more complex
 than expected because pre-commit downloads hook repos from GitHub, which
 prompts for usernames and passwords #21
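
A possible direction, sketched below but not included in this patch: define
black as a "local" hook in .pre-commit-config.yaml so pre-commit runs the
black already installed in the dev environment instead of cloning the hook
repo from github.com, which is presumably where the username/password
prompts come from. The file name and hook wiring here are assumptions, not
committed code:

    # hypothetical .pre-commit-config.yaml sketch; assumes black is on PATH,
    # e.g. via the [tool.hatch.envs.dev] dependencies in pyproject.toml
    repos:
      - repo: local
        hooks:
          - id: black
            name: black
            entry: black      # run the locally installed black
            language: system  # use the system environment, no download
            types: [python]   # only run on Python files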

---
 acmc/phen.py       | 72 ++++++++++++++++++++++------------------------
 pyproject.toml     |  2 +-
 tests/test_acmc.py |  2 +-
 3 files changed, 37 insertions(+), 39 deletions(-)

diff --git a/acmc/phen.py b/acmc/phen.py
index 3157b52..8669668 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -44,6 +44,7 @@ COL_ACTIONS = [SPLIT_COL_ACTION, CODES_COL_ACTION, DIVIDE_COL_ACTION]
 
 CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"]
 
+
 class PhenValidationException(Exception):
     """Custom exception class raised when validation errors in phenotype configuration file"""
 
@@ -286,8 +287,8 @@ def validate(phen_dir):
             concept_set_names.append(item["name"])
 
     # TODO: change this to some sort of yaml schema validation
-    required_keys = {"name", "file", "metadata"}    
-    
+    required_keys = {"name", "file", "metadata"}
+
     # check codes definition
     for item in phenotype["concept_sets"]:
 
@@ -308,10 +309,12 @@ def validate(phen_dir):
 
             # check code file type is supported
             if concept_code_file_path.suffix not in CODE_FILE_TYPES:
-                raise ValueError(f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types")
-            
-            # check columns specified are a supported medical coding type        
-            for column in item["file"]["columns"]:                   
+                raise ValueError(
+                    f"Unsupported filetype {concept_code_file_path.suffix}, only csv, xlsx and xls code file types are supported"
+                )
+
+            # check columns specified are a supported medical coding type
+            for column in item["file"]["columns"]:
                 if column not in code_types:
                     validation_errors.append(
                         f"Column type {column} for file {concept_code_file_path} is not supported"
@@ -321,9 +324,7 @@ def validate(phen_dir):
             if "actions" in item["file"]:
                 for action in item["file"]["actions"]:
                     if action not in COL_ACTIONS:
-                        validation_errors.append(
-                            f"Action {action} is not supported"
-                        )
+                        validation_errors.append(f"Action {action} is not supported")
 
         else:
             validation_errors.append(
@@ -356,7 +357,9 @@ def read_table_file(path, excel_sheet=None):
     elif path.suffix == ".dta":
         df = pd.read_stata(path, dtype=str)
     else:
-        raise ValueError(f"Unsupported filetype {codes_file_path.suffix}, only support{CODE_FILE_TYPES} code file types")   
+        raise ValueError(
+            f"Unsupported filetype {path.suffix}, only supports {CODE_FILE_TYPES} code file types"
+        )
 
     return df
 
@@ -392,10 +395,13 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
     out = pd.DataFrame([])  # create output df to append to
     code_errors = []  # list of errors from processing
 
-    # TODO: Is there a better way of processing this action as it's distributed across 
+    # TODO: Is there a better way of processing this action as it's distributed across
     # different parts of the programme.
-    if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"]:
-        divide_col_df = df[concept_set["file"]["actions"]["divide_col"]]  
+    if (
+        "actions" in concept_set["file"]
+        and "divide_col" in concept_set["file"]["actions"]
+    ):
+        divide_col_df = df[concept_set["file"]["actions"]["divide_col"]]
     else:
         divide_col_df = pd.DataFrame()
 
@@ -421,10 +427,10 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
             [out, pd.DataFrame({code_type: codes}).join(divide_col_df)],
             ignore_index=True,
         )
-        
+
     return out, code_errors
-    
-    
+
+
 # Translate Df with multiple codes into single code type Series
 def translate_codes(df, target_code_type):
     codes = pd.Series([], dtype=str)
@@ -578,33 +584,27 @@ def map(phen_dir, target_code_type):
         logger.debug(f"Length of errors from preprocess {len(errors)}")
         if len(errors) > 0:
             code_errors.extend(errors)
-        logger.debug(f" Length of code_errors {len(code_errors)}")      
-        
+        logger.debug(f"Length of code_errors {len(code_errors)}")
+
         # Map
         # if processing a source coding list with categorical data
-        if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"] and len(df) > 0:
+        if (
+            "actions" in concept_set["file"]
+            and "divide_col" in concept_set["file"]["actions"]
+            and len(df) > 0
+        ):
             divide_col = concept_set["file"]["actions"]["divide_col"]
             logger.debug(f"Action: Dividing Table by {divide_col}")
             logger.debug(f"column into: {df[divide_col].unique()}")
             df_grp = df.groupby(divide_col)
-            for cat, grp in df_grp:                 
+            for cat, grp in df_grp:
                 if cat == concept_set["file"]["category"]:
-                    grp = grp.drop(
-                        columns=[divide_col]
-                    )  # delete categorical column
+                    grp = grp.drop(columns=[divide_col])  # delete categorical column
                     out = map_file(
-                        grp,
-                        target_code_type,
-                        out,
-                        concept_name=concept_set['name']
+                        grp, target_code_type, out, concept_name=concept_set["name"]
                     )
         else:
-            out = map_file(
-                df,
-                target_code_type,
-                out,
-                concept_name=concept_set['name']
-            )                             
+            out = map_file(df, target_code_type, out, concept_name=concept_set["name"])
 
     if len(code_errors) > 0:
         logger.error(f"The map processing has {len(code_errors)} errors")
@@ -619,7 +619,7 @@ def map(phen_dir, target_code_type):
         raise Exception(
             f"No output after map processing, check config {str(config_path.resolve())}"
         )
-        
+
     # Final processing
     out = out.reset_index(drop=True)
     out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
@@ -847,9 +847,7 @@ def diff(phen_dir, phen_old_dir):
     new_config = new_phen_path / CONFIG_FILE
     with new_config.open("r") as file:
         new_config = yaml.safe_load(file)
-    report.write(
-        f"\n\n# Report for version {new_config['phenotype']['version']}\n\n"
-    )
+    report.write(f"\n\n# Report for version {new_config['phenotype']['version']}\n\n")
     report.write(f"- Removed outputs: {list(removed_outputs)}\n")
     report.write(f"- Added outputs: {list(added_outputs)}\n")
     report.write(f"- Common outputs: {list(common_outputs)}\n")
diff --git a/pyproject.toml b/pyproject.toml
index 9516588..439a5d5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,7 +62,7 @@ dependencies = [
 
 [tool.hatch.envs.dev]
 dependencies = [
-	"pydocstyle",	
+	"pydocstyle",
     "pytest",
     "black",
     "mypy"
diff --git a/tests/test_acmc.py b/tests/test_acmc.py
index e70fc0a..43b8197 100644
--- a/tests/test_acmc.py
+++ b/tests/test_acmc.py
@@ -51,7 +51,7 @@ def test_phen_init_local_specified(tmp_dir, monkeypatch, caplog):
     [
         ("config1.yaml"),  # config.yaml test case
         ("config2.yaml"),  # config.yaml test case
-        ("config3.yaml"),  # config.yaml test case        
+        ("config3.yaml"),  # config.yaml test case
     ],
 )
 def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file):
-- 
GitLab