Skip to content
Snippets Groups Projects
Commit 9eca4250 authored by mjbonifa's avatar mjbonifa
Browse files

refactor: converted all examples to yaml and added test for config2. Updated...

refactor: converted all examples to yaml and added test for config2. Updated readme. Fixed bug in diff that assumed all csv files in the map directory were map files, when some were error files (written when no codes exist); moved error files to an errors directory inside the map directory to avoid this.
parent 4f35f9ba
No related branches found
No related tags found
No related merge requests found
......@@ -189,7 +189,7 @@ cp -r ./examples/codes/* ./workspace/phen/codes
From the command prompt, copy example phenotype configuration files `/examples/config.json` to the phenotype directory:
```bash
cp -r ./examples/config.json ./workspace/phen
cp -r ./examples/config1.yaml ./workspace/phen/config.yaml
```
- You can view the configuration file here [`config.json`](./examples/config.json)
......
......@@ -32,7 +32,7 @@ MAP_DIR = "map"
CONCEPT_SET_DIR = "concept-set"
OMOP_DIR = "omop"
DEFAULT_PHEN_DIR_LIST = [CODES_DIR, MAP_DIR, CONCEPT_SET_DIR, OMOP_DIR]
CONFIG_FILE = "config.json"
CONFIG_FILE = "config.yaml"
VOCAB_VERSION_FILE = "vocab_version.yaml"
DEFAULT_GIT_BRANCH = "main"
......@@ -202,9 +202,8 @@ def init(phen_dir, remote_url):
"codes": [],
}
config_path = phen_path / CONFIG_FILE
with open(config_path, "w", encoding="utf-8") as f:
json.dump(config, f, indent=4)
with open(phen_path / CONFIG_FILE, "w") as file:
yaml.dump(config, file, default_flow_style=False, sort_keys=False)
# add git ignore
ignore_content = """# Ignore SQLite database files
......@@ -233,7 +232,7 @@ def validate(phen_dir):
logger.info(f"Validating phenotype: {phen_dir}")
phen_path = Path(phen_dir)
if not phen_path.is_dir():
raise NotADirectoryError(f"Error: '{phen_path}' is not a directory")
raise NotADirectoryError(f"Error: '{str(phen_path.resolve())}' is not a directory")
config_path = phen_path / CONFIG_FILE
if not config_path.is_file():
......@@ -254,8 +253,9 @@ def validate(phen_dir):
raise Exception(f"Phen directory {phen_path} is not a git repo")
# Load configuration File
if config_path.suffix == ".json":
mapping = json.load(open(config_path, "rb"))
if config_path.suffix == ".yaml":
with config_path.open("r") as file:
mapping = yaml.safe_load(file)
else:
raise Exception(
f"Unsupported configuration filetype: {str(config_path.resolve())}"
......@@ -582,7 +582,9 @@ def map(phen_dir, target_code_type):
codes_path = phen_path / CODES_DIR
# load configuration
config = json.load(open(config_path, "rb"))
with config_path.open("r") as file:
config = yaml.safe_load(file)
concept_sets = config["concept_sets"]
codes = config["codes"]
......@@ -633,6 +635,7 @@ def map(phen_dir, target_code_type):
df = df.groupby(divide_col)
# Map to Concept/Phenotype
# TODO: This code needs refactory as it seems handling of the concept_set_categories should happen at another place
if len(df.index) != 0:
if ("concept_set" in file) and isinstance(df, pd.core.frame.DataFrame):
out = map_file(
......@@ -661,10 +664,10 @@ def map(phen_dir, target_code_type):
concepts=file["concept_set_categories"][cat],
meta_columns=meta_columns,
)
else:
raise AttributeError(
f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
)
else:
raise AttributeError(
f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
)
else:
logger.warning(
f"File {file} has no output after preprocessing in config {str(config_path.resolve())}"
......@@ -672,8 +675,10 @@ def map(phen_dir, target_code_type):
if len(code_errors) > 0:
logger.error(f"The map processing has {len(code_errors)} errors")
error_filename = f"{target_code_type}-code-errors.csv"
write_code_errors(code_errors, phen_path / MAP_DIR / error_filename)
error_path = phen_path / MAP_DIR / "errors"
error_path.mkdir(parents=True, exist_ok=True)
error_filename = f"{target_code_type}-code-errors.csv"
write_code_errors(code_errors, error_path / error_filename)
# Check there is output from processing
if len(out.index) == 0:
......@@ -681,7 +686,6 @@ def map(phen_dir, target_code_type):
raise Exception(
f"No output after map processing, check config {str(config_path.resolve())}"
)
# Final processing
out = out.reset_index(drop=True)
out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
......@@ -760,7 +764,8 @@ def publish(phen_dir):
# get major version from configuration file
config_path = phen_path / CONFIG_FILE
config = json.load(open(config_path, "rb"))
with config_path.open("r") as file:
config = yaml.safe_load(file)
match = re.match(r"v(\d+\.\d+)", config["concept_sets"]["version"])
major_version = match.group(1)
......@@ -772,8 +777,8 @@ def publish(phen_dir):
version = f"v{major_version}.{next_minor_version}"
logger.debug(f"New version: {version}")
config["concept_sets"]["version"] = version
with open(config_path, "w", encoding="utf-8") as f:
json.dump(config, f, indent=4)
with open(config_path, "w") as file:
yaml.dump(config, file, default_flow_style=False, sort_keys=False)
# Add and commit changes to repo
commit_message = f"Committing updates to phenotype {phen_path}"
......@@ -808,7 +813,8 @@ def export(phen_dir, version):
# load configuration
config_path = phen_path / CONFIG_FILE
config = json.load(open(config_path, "rb"))
with config_path.open("r") as file:
config = yaml.safe_load(file)
map_path = phen_path / MAP_DIR
if not map_path.exists():
......@@ -919,8 +925,9 @@ def diff(phen_dir, phen_old_dir):
common_outputs = old_output_set & new_output_set
# Write outputs report
new_config_path = new_phen_path / CONFIG_FILE
new_config = json.load(open(new_config_path, "rb"))
new_config = new_phen_path / CONFIG_FILE
with new_config.open("r") as file:
new_config = yaml.safe_load(file)
report.write(
f"\n\n# Report for version {new_config['concept_sets']['version']}\n\n"
)
......@@ -936,9 +943,12 @@ def diff(phen_dir, phen_old_dir):
old_output = old_map_path / file
new_output = new_map_path / file
logger.debug(f"Old ouptput: {str(old_output.resolve())}")
logger.debug(f"New ouptput: {str(new_output.resolve())}")
df1 = pd.read_csv(old_output)
df1 = df1[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count()
df2 = pd.read_csv(new_output)
df2 = pd.read_csv(new_output)
df2 = df2[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count()
# Check for added and removed concepts
......@@ -963,21 +973,3 @@ def diff(phen_dir, phen_old_dir):
report.write(f"- Changed concepts []\n\n")
logger.info(f"Phenotypes diff'd successfully")
# Here's the atlas code that needs to go into anotehr function
# if output_path == "atlas":
# vocab_id = summary_config["omop"]["vocabulary_id"]
# vocab_version = summary_config["version"]
# vocab_name = summary_config["omop"]["vocabulary_name"]
# vocab_reference = summary_config["omop"]["vocabulary_reference"]
# Create New OMOP Vocabulary
# omop_setup(OMOP_DB_PATH, vocab_id, vocab_version, vocab_name, vo#cab_reference)
# Export to DB
# omop_publish_concept_sets(out,
# OMOP_DB_PATH,
# vocab_id,
# omop_vocab_types[target_code_type],
# vocab_version,)
{
"concept_sets": {
"version": "v1.0.1",
"omop": {
"vocabulary_id": "ACMC_Example",
"vocabulary_name": "ACMC example phenotype",
"vocabulary_reference": "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples"
},
"concept_set": [
{
"concept_set_name": "ABDO_PAIN",
"concept_set_status": "AGREED",
"metadata": {
}
}
]
},
"codes": [
{
"folder": "clinical-codes-org",
"description": "Downloaded 16/11/23",
"files": [
{
"file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv",
"columns": {
"read2": "code",
"metadata": [
"description"
]
},
"concept_set": [
"ABDO_PAIN"
]
}
]
}
]
}
File moved
{
"concept_sets": {
"version": "v1.0.4",
"omop": {
"vocabulary_id": "ACMC_Example",
"vocabulary_name": "ACMC example phenotype",
"vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
},
"concept_set": [
{
"concept_set_name": "CVD_EVENTS",
"concept_set_status": "AGREED",
"metadata": {}
},
{
"concept_set_name": "DID_NOT_ATTEND",
"concept_set_status": "AGREED",
"metadata": {}
}
]
},
"codes": [
{
"folder": "clinical-codes-org",
"description": "Downloaded 16/11/23",
"files": [
{
"file": "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv",
"columns": {
"icd10": "code",
"metadata": []
},
"concept_set": [
"CVD_EVENTS"
]
},
{
"file": "Non-attendance codes/res201-did-not-attend-appointment.csv",
"columns": {
"read2": "code",
"metadata": []
},
"concept_set": [
"DID_NOT_ATTEND"
]
}
]
}
]
}
concept_sets:
version: "v1.0.4"
omop:
vocabulary_id: "ACMC_Example"
vocabulary_name: "ACMC example phenotype"
vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
concept_set:
- concept_set_name: "CVD_EVENTS"
concept_set_status: "AGREED"
metadata: {}
- concept_set_name: "DID_NOT_ATTEND"
concept_set_status: "AGREED"
metadata: {}
codes:
- folder: "clinical-codes-org"
description: "Downloaded 16/11/23"
files:
- file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
columns:
icd10: "code"
metadata: []
concept_set:
- "CVD_EVENTS"
- file: "Non-attendance codes/res201-did-not-attend-appointment.csv"
columns:
read2: "code"
metadata: []
concept_set:
- "DID_NOT_ATTEND"
......@@ -44,7 +44,16 @@ def test_phen_init_local_specified(tmp_dir, monkeypatch, caplog):
assert "Phenotype initialised successfully" in caplog.text
def test_phen_workflow(tmp_dir, monkeypatch, caplog):
# TODO: This test will need to be refactored so that the expected outputs match the config files
# right now it just tests that it runs successfully and does not check the contents of the output
@pytest.mark.parametrize("config_file", [
("config1.yaml"), # config.yaml test case
("config2.yaml"), # config.yaml test case
])
def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file):
print(f"Temporary directory: {tmp_dir}") # Prints path for debugging
with caplog.at_level(logging.DEBUG):
phen_path = tmp_dir / "phen"
phen_path = phen_path.resolve()
......@@ -69,6 +78,9 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog):
else:
shutil.copy(source, destination)
# copy the test file to configuration
shutil.copy(phen_path / config_file, phen_path / "config.yaml")
monkeypatch.setattr(
sys, "argv", ["main.py", "phen", "validate", "-d", str(phen_path.resolve())]
)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment