From 9eca4250301ca02702116abf6cfccf592943e2f9 Mon Sep 17 00:00:00 2001
From: Michael Boniface <m.j.boniface@soton.ac.uk>
Date: Mon, 24 Feb 2025 12:04:41 +0000
Subject: [PATCH] refactor: converted all examples to yaml and added test for
 config2. Updated readme. Fixed but in diff that assumed all csv files in map
 directory where map files when some where error files if no codes exists,
 moved error files to errors directory in the map directory to avoid this.

---
 README.md                              |  2 +-
 acmc/phen.py                           | 72 ++++++++++++--------------
 examples/config.json                   | 38 --------------
 examples/{config.yaml => config1.yaml} |  0
 examples/config2.json                  | 50 ------------------
 examples/config2.yaml                  | 31 +++++++++++
 tests/test_acmc.py                     | 14 ++++-
 7 files changed, 77 insertions(+), 130 deletions(-)
 delete mode 100644 examples/config.json
 rename examples/{config.yaml => config1.yaml} (100%)
 delete mode 100644 examples/config2.json
 create mode 100644 examples/config2.yaml

diff --git a/README.md b/README.md
index 1125458..5dfb6cc 100644
--- a/README.md
+++ b/README.md
@@ -189,7 +189,7 @@ cp -r ./examples/codes/* ./workspace/phen/codes
 	From the command prompt, copy example phenotype configuration files `/examples/config.json` to the phenotype directory:
 
 ```bash
-cp -r ./examples/config.json ./workspace/phen
+cp -r ./examples/config1.yaml ./workspace/phen/config.yaml
 ```
 
    - You can view the configuarion file here [`config.json`](./examples/config.json) 
diff --git a/acmc/phen.py b/acmc/phen.py
index dd2b6ff..14dc583 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -32,7 +32,7 @@ MAP_DIR = "map"
 CONCEPT_SET_DIR = "concept-set"
 OMOP_DIR = "omop"
 DEFAULT_PHEN_DIR_LIST = [CODES_DIR, MAP_DIR, CONCEPT_SET_DIR, OMOP_DIR]
-CONFIG_FILE = "config.json"
+CONFIG_FILE = "config.yaml"
 VOCAB_VERSION_FILE = "vocab_version.yaml"
 
 DEFAULT_GIT_BRANCH = "main"
@@ -202,9 +202,8 @@ def init(phen_dir, remote_url):
         "codes": [],
     }
 
-    config_path = phen_path / CONFIG_FILE
-    with open(config_path, "w", encoding="utf-8") as f:
-        json.dump(config, f, indent=4)
+    with open(phen_path / CONFIG_FILE, "w") as file:
+        yaml.dump(config, file, default_flow_style=False, sort_keys=False)
 
     # add git ignore
     ignore_content = """# Ignore SQLite database files
@@ -233,7 +232,7 @@ def validate(phen_dir):
     logger.info(f"Validating phenotype: {phen_dir}")
     phen_path = Path(phen_dir)
     if not phen_path.is_dir():
-        raise NotADirectoryError(f"Error: '{phen_path}' is not a directory")
+        raise NotADirectoryError(f"Error: '{str(phen_path.resolve())}' is not a directory")
 
     config_path = phen_path / CONFIG_FILE
     if not config_path.is_file():
@@ -254,8 +253,9 @@ def validate(phen_dir):
         raise Exception(f"Phen directory {phen_path} is not a git repo")
 
     # Load configuration File
-    if config_path.suffix == ".json":
-        mapping = json.load(open(config_path, "rb"))
+    if config_path.suffix == ".yaml":
+        with config_path.open("r") as file:
+            mapping = yaml.safe_load(file)
     else:
         raise Exception(
             f"Unsupported configuration filetype: {str(config_path.resolve())}"
@@ -582,7 +582,9 @@ def map(phen_dir, target_code_type):
     codes_path = phen_path / CODES_DIR
 
     # load configuration
-    config = json.load(open(config_path, "rb"))
+    with config_path.open("r") as file:
+        config = yaml.safe_load(file)
+
     concept_sets = config["concept_sets"]
     codes = config["codes"]
 
@@ -633,6 +635,7 @@ def map(phen_dir, target_code_type):
                 df = df.groupby(divide_col)
 
             # Map to Concept/Phenotype
+            # TODO: This code needs refactory as it seems handling of the concept_set_categories should happen at another place
             if len(df.index) != 0:
                 if ("concept_set" in file) and isinstance(df, pd.core.frame.DataFrame):
                     out = map_file(
@@ -661,10 +664,10 @@ def map(phen_dir, target_code_type):
                                 concepts=file["concept_set_categories"][cat],
                                 meta_columns=meta_columns,
                             )
-                else:
-                    raise AttributeError(
-                        f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
-                    )
+                    else:
+                        raise AttributeError(
+                            f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
+                        )
             else:
                 logger.warning(
                     f"File {file} has no output after preprocessing in config {str(config_path.resolve())}"
@@ -672,8 +675,10 @@ def map(phen_dir, target_code_type):
 
     if len(code_errors) > 0:
         logger.error(f"The map processing has {len(code_errors)} errors")
-        error_filename = f"{target_code_type}-code-errors.csv"
-        write_code_errors(code_errors, phen_path / MAP_DIR / error_filename)
+        error_path = phen_path / MAP_DIR / "errors"
+        error_path.mkdir(parents=True, exist_ok=True)                
+        error_filename = f"{target_code_type}-code-errors.csv"        
+        write_code_errors(code_errors, error_path / error_filename)
 
     # Check there is output from processing
     if len(out.index) == 0:
@@ -681,7 +686,6 @@ def map(phen_dir, target_code_type):
         raise Exception(
             f"No output after map processing, check config {str(config_path.resolve())}"
         )
-
     # Final processing
     out = out.reset_index(drop=True)
     out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
@@ -760,7 +764,8 @@ def publish(phen_dir):
 
     # get major version from configuration file
     config_path = phen_path / CONFIG_FILE
-    config = json.load(open(config_path, "rb"))
+    with config_path.open("r") as file:
+        config = yaml.safe_load(file)
     match = re.match(r"v(\d+\.\d+)", config["concept_sets"]["version"])
     major_version = match.group(1)
 
@@ -772,8 +777,8 @@ def publish(phen_dir):
     version = f"v{major_version}.{next_minor_version}"
     logger.debug(f"New version: {version}")
     config["concept_sets"]["version"] = version
-    with open(config_path, "w", encoding="utf-8") as f:
-        json.dump(config, f, indent=4)
+    with open(config_path, "w") as file:
+        yaml.dump(config, file, default_flow_style=False, sort_keys=False)
 
     # Add and commit changes to repo
     commit_message = f"Committing updates to phenotype {phen_path}"
@@ -808,7 +813,8 @@ def export(phen_dir, version):
 
     # load configuration
     config_path = phen_path / CONFIG_FILE
-    config = json.load(open(config_path, "rb"))
+    with config_path.open("r") as file:
+        config = yaml.safe_load(file)
 
     map_path = phen_path / MAP_DIR
     if not map_path.exists():
@@ -919,8 +925,9 @@ def diff(phen_dir, phen_old_dir):
     common_outputs = old_output_set & new_output_set
 
     # Write outputs report
-    new_config_path = new_phen_path / CONFIG_FILE
-    new_config = json.load(open(new_config_path, "rb"))
+    new_config = new_phen_path / CONFIG_FILE
+    with new_config.open("r") as file:
+        new_config = yaml.safe_load(file)
     report.write(
         f"\n\n# Report for version {new_config['concept_sets']['version']}\n\n"
     )
@@ -936,9 +943,12 @@ def diff(phen_dir, phen_old_dir):
         old_output = old_map_path / file
         new_output = new_map_path / file
 
+        logger.debug(f"Old ouptput: {str(old_output.resolve())}")
+        logger.debug(f"New ouptput: {str(new_output.resolve())}") 
+
         df1 = pd.read_csv(old_output)
         df1 = df1[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count()
-        df2 = pd.read_csv(new_output)
+        df2 = pd.read_csv(new_output)   
         df2 = df2[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count()
 
         # Check for added and removed concepts
@@ -963,21 +973,3 @@ def diff(phen_dir, phen_old_dir):
             report.write(f"- Changed concepts []\n\n")
 
     logger.info(f"Phenotypes diff'd successfully")
-
-
-# Here's the atlas code that needs to go into anotehr function
-# 	if output_path == "atlas":
-# 		vocab_id = summary_config["omop"]["vocabulary_id"]
-# 		vocab_version = summary_config["version"]
-# 		vocab_name = summary_config["omop"]["vocabulary_name"]
-# 		vocab_reference = summary_config["omop"]["vocabulary_reference"]
-
-# Create New OMOP Vocabulary
-# 		omop_setup(OMOP_DB_PATH, vocab_id, vocab_version, vocab_name, vo#cab_reference)
-
-# Export to DB
-# 		omop_publish_concept_sets(out,
-# 								  OMOP_DB_PATH,
-# 								  vocab_id,
-# 								  omop_vocab_types[target_code_type],
-# 								  vocab_version,)
diff --git a/examples/config.json b/examples/config.json
deleted file mode 100644
index 2acfeae..0000000
--- a/examples/config.json
+++ /dev/null
@@ -1,38 +0,0 @@
-{
-    "concept_sets": {
-        "version": "v1.0.1",
-        "omop": {
-            "vocabulary_id": "ACMC_Example",
-            "vocabulary_name": "ACMC example phenotype",
-            "vocabulary_reference": "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples"
-        },
-        "concept_set": [
-            {
-                "concept_set_name": "ABDO_PAIN",
-                "concept_set_status": "AGREED",
-                "metadata": {
-                }
-            }
-        ]
-    },
-    "codes": [
-        {
-            "folder": "clinical-codes-org",
-            "description": "Downloaded 16/11/23",
-            "files": [
-                {
-                    "file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv",
-                    "columns": {
-                        "read2": "code",
-                        "metadata": [
-                            "description"
-                        ]
-                    },
-                    "concept_set": [
-                        "ABDO_PAIN"
-                    ]
-                }
-            ]
-        }
-    ]
-}
diff --git a/examples/config.yaml b/examples/config1.yaml
similarity index 100%
rename from examples/config.yaml
rename to examples/config1.yaml
diff --git a/examples/config2.json b/examples/config2.json
deleted file mode 100644
index 42578cf..0000000
--- a/examples/config2.json
+++ /dev/null
@@ -1,50 +0,0 @@
-{
-    "concept_sets": {
-        "version": "v1.0.4",
-        "omop": {
-            "vocabulary_id": "ACMC_Example",
-            "vocabulary_name": "ACMC example phenotype",
-            "vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
-        },
-        "concept_set": [
-            {
-                "concept_set_name": "CVD_EVENTS",
-                "concept_set_status": "AGREED",
-                "metadata": {}
-            },
-            {
-                "concept_set_name": "DID_NOT_ATTEND",
-                "concept_set_status": "AGREED",
-                "metadata": {}
-            }
-        ]
-    },
-    "codes": [
-        {
-            "folder": "clinical-codes-org",
-            "description": "Downloaded 16/11/23",
-            "files": [
-                {
-                    "file": "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv",
-                    "columns": {
-                        "icd10": "code",
-                        "metadata": []
-                    },
-                    "concept_set": [
-                        "CVD_EVENTS"
-                    ]
-                },
-                {
-                    "file": "Non-attendance codes/res201-did-not-attend-appointment.csv",
-                    "columns": {
-                        "read2": "code",
-                        "metadata": []
-                    },
-                    "concept_set": [
-                        "DID_NOT_ATTEND"
-                    ]
-                }
-            ]
-        }
-    ]
-}
diff --git a/examples/config2.yaml b/examples/config2.yaml
new file mode 100644
index 0000000..ee36425
--- /dev/null
+++ b/examples/config2.yaml
@@ -0,0 +1,31 @@
+concept_sets:
+  version: "v1.0.4"
+  omop:
+    vocabulary_id: "ACMC_Example"
+    vocabulary_name: "ACMC example phenotype"
+    vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
+  concept_set:
+    - concept_set_name: "CVD_EVENTS"
+      concept_set_status: "AGREED"
+      metadata: {}
+    - concept_set_name: "DID_NOT_ATTEND"
+      concept_set_status: "AGREED"
+      metadata: {}
+
+codes:
+  - folder: "clinical-codes-org"
+    description: "Downloaded 16/11/23"
+    files:
+      - file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
+        columns:
+          icd10: "code"
+          metadata: []
+        concept_set:
+          - "CVD_EVENTS"
+      - file: "Non-attendance codes/res201-did-not-attend-appointment.csv"
+        columns:
+          read2: "code"
+          metadata: []
+        concept_set:
+          - "DID_NOT_ATTEND"
+
diff --git a/tests/test_acmc.py b/tests/test_acmc.py
index ecdadae..a89f0e6 100644
--- a/tests/test_acmc.py
+++ b/tests/test_acmc.py
@@ -44,7 +44,16 @@ def test_phen_init_local_specified(tmp_dir, monkeypatch, caplog):
     assert "Phenotype initialised successfully" in caplog.text
 
 
-def test_phen_workflow(tmp_dir, monkeypatch, caplog):
+# TODO: This test will need to be refactored so that the expected outputs match the config files
+# right now it just tests that it runs successfully and does not check the contents of the output
+@pytest.mark.parametrize("config_file", [
+    ("config1.yaml"),   # config.yaml test case
+    ("config2.yaml"),   # config.yaml test case    
+    
+])
+def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file):
+    print(f"Temporary directory: {tmp_dir}")  # Prints path for debugging
+    
     with caplog.at_level(logging.DEBUG):
         phen_path = tmp_dir / "phen"
         phen_path = phen_path.resolve()
@@ -69,6 +78,9 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog):
             else:
                 shutil.copy(source, destination)
 
+        # copy the test file to configuration
+        shutil.copy(phen_path / config_file, phen_path / "config.yaml")                
+
         monkeypatch.setattr(
             sys, "argv", ["main.py", "phen", "validate", "-d", str(phen_path.resolve())]
         )
-- 
GitLab