From ba8a7ccbd5b8417c93bd1d7bb0e54217e8f48c4b Mon Sep 17 00:00:00 2001
From: Michael Boniface <m.j.boniface@soton.ac.uk>
Date: Wed, 26 Feb 2025 17:50:07 +0000
Subject: [PATCH] test: added tests for diff between config 1, 2, 3 in
 test_diff. Updated the diff report so that it also compares the configuration
 files using deepdiff which can ccompare dict structures. closes #34

---
 acmc/omop.py          |  22 ++--
 acmc/phen.py          | 246 ++++++++++++++++++++++++++++++++----------
 contributing.md       |   6 ++
 examples/config1.yaml |   4 +-
 examples/config2.yaml |   6 +-
 examples/config3.yaml |   6 +-
 pyproject.toml        |   1 +
 tests/test_acmc.py    | 165 ++++++++++++++++++++++++++++
 8 files changed, 380 insertions(+), 76 deletions(-)

diff --git a/acmc/omop.py b/acmc/omop.py
index baf6e7f..744dc49 100644
--- a/acmc/omop.py
+++ b/acmc/omop.py
@@ -26,17 +26,17 @@ vocabularies = {
     "url": "https://athena.ohdsi.org/vocabulary/list",
     "version": "",
     "vocabularies": [
-        {"id": 1, "name": "SNOMED"}, # No license required
-        {"id": 2, "name": "ICD9CM"}, # No license required
-        {"id": 17, "name": "Readv2"}, # No license required
-        {"id": 21, "name": "ATC"}, # No license required
-        {"id": 55, "name": "OPCS4"}, # No license required
-        {"id": 57, "name": "HES Specialty"}, # No license required
-        {"id": 70, "name": "ICD10CM"}, # No license required
-        {"id": 75, "name": "dm+d"}, # No license required
-        {"id": 144, "name": "UK Biobank"}, # No license required
-        {"id": 154, "name": "NHS Ethnic Category"}, # No license required
-        {"id": 155, "name": "NHS Place of Service"}, # No license required
+        {"id": 1, "name": "SNOMED"},  # No license required
+        {"id": 2, "name": "ICD9CM"},  # No license required
+        {"id": 17, "name": "Readv2"},  # No license required
+        {"id": 21, "name": "ATC"},  # No license required
+        {"id": 55, "name": "OPCS4"},  # No license required
+        {"id": 57, "name": "HES Specialty"},  # No license required
+        {"id": 70, "name": "ICD10CM"},  # No license required
+        {"id": 75, "name": "dm+d"},  # No license required
+        {"id": 144, "name": "UK Biobank"},  # No license required
+        {"id": 154, "name": "NHS Ethnic Category"},  # No license required
+        {"id": 155, "name": "NHS Place of Service"},  # No license required
     ],
     "tables": [],
 }
diff --git a/acmc/phen.py b/acmc/phen.py
index 39012ed..0b54d16 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -12,6 +12,7 @@ import logging
 import requests
 import yaml
 from cerberus import Validator
+from deepdiff import DeepDiff
 from pathlib import Path
 from urllib.parse import urlparse, urlunparse
 
@@ -152,28 +153,32 @@ def create_empty_git_dir(path):
     keep_path.touch(exist_ok=True)
 
 
+def check_delete_dir(path, msg):
+    deleted = False
+
+    user_input = input(f"{msg}").strip().lower()
+    if user_input in ["yes", "y"]:
+        shutil.rmtree(path)
+        deleted = True
+    else:
+        logger.info("Directory was not deleted.")
+
+    return deleted
+
+
 def init(phen_dir, remote_url):
     """Initial phenotype directory as git repo with standard structure"""
     logger.info(f"Initialising Phenotype in directory: {phen_dir}")
     phen_path = Path(phen_dir)
 
     # check if directory already exists and ask user if they want to recreate it
-    configure = False
     if (
         phen_path.exists() and phen_path.is_dir()
     ):  # Check if it exists and is a directory
-        user_input = (
-            input(
-                f"The phen directory already exists. Do you want to reinitialise? (yes/no): "
-            )
-            .strip()
-            .lower()
+        configure = check_delete_dir(
+            phen_path,
+            f"The phen directory already exists. Do you want to reinitialise? (yes/no): ",
         )
-        if user_input in ["yes", "y"]:
-            shutil.rmtree(phen_path)
-            configure = True
-        else:
-            logger.info("Phen directory was not recreated.")
     else:
         configure = True
 
@@ -829,16 +834,22 @@ def copy(phen_dir, target_dir, version):
     copy_path = target_path / version
     logger.info(f"Copying repo {phen_path} to {copy_path}")
 
-    if not copy_path.exists():
-        # If copy directory doesn't exist, clone the repo
-        logger.debug(f"Cloning repo from {phen_path} into {copy_path}...")
-        repo = git.Repo.clone_from(phen_path, copy_path)
-    else:
-        # If copy directory exists, open the repo
-        logger.debug(
-            f"Copy of repository already exists in {copy_path}. Opening the repo..."
+    if (
+        copy_path.exists() and copy_path.is_dir()
+    ):  # Check if it exists and is a directory
+        copy = check_delete_dir(
+            copy_path,
+            f"The directory {str(copy_path.resolve())} already exists. Do you want to overwrite? (yes/no): ",
         )
-        repo = git.Repo(copy_path)
+    else:
+        copy = True
+
+    if not copy:
+        logger.info(f"Not copying the version {version}")
+        return
+
+    logger.debug(f"Cloning repo from {phen_path} into {copy_path}...")
+    repo = git.Repo.clone_from(phen_path, copy_path)
 
     # Check out the latest commit or specified version
     if version:
@@ -855,27 +866,104 @@ def copy(phen_dir, target_dir, version):
     logger.info(f"Phenotype copied successfully")
 
 
-def diff(phen_dir, phen_old_dir):
-    """Compare the differences between two versions of a phenotype"""
+# Convert concept_sets list into dictionaries
+def extract_concepts(config_data):
+    """Extracts concepts as {name: file_path} dictionary and a name set."""
+    concepts_dict = {
+        item["name"]: item["file"]["path"]
+        for item in config_data["phenotype"]["concept_sets"]
+    }
+    name_set = set(concepts_dict.keys())
+    return concepts_dict, name_set
 
-    # validate phenotype directories
-    validate(phen_old_dir)
-    validate(phen_dir)
 
-    old_phen_path = Path(phen_old_dir)
-    new_phen_path = Path(phen_dir)
+def extract_clean_deepdiff_keys(diff, key_type):
+    """
+    Extracts clean keys from a DeepDiff dictionary.
 
-    # Load report (FOR SOME REASON THIS WAS APPEND SO SET TO w for NOW)
-    report_file_name = old_phen_path.name + "_diff.md"
-    report_path = new_phen_path / report_file_name
-    report = open(report_path, "w")
-    logger.debug(f"Writing to report file {str(report_path.resolve())}")
+    :param diff: DeepDiff result dictionary
+    :param key_type: The type of change to extract (e.g., "dictionary_item_added", "dictionary_item_removed")
+    :return: A set of clean key names
+    """
+    return {key.split("root['")[1].split("']")[0] for key in diff.get(key_type, [])}
 
-    # Get maps files from phenotype
-    old_map_path = old_phen_path / MAP_DIR
-    new_map_path = new_phen_path / MAP_DIR
 
-    # List files from output directories
+def diff_config(old_config, new_config):
+    report = f"\n# Changes to phenotype configuration\n"
+    report += f"This compares changes in the phenotype configuration including added, removed and renamed concept sets and changes to concept set source concept code file paths\n\n"
+
+    old_concepts, old_names = extract_concepts(old_config)
+    new_concepts, new_names = extract_concepts(new_config)
+
+    # Check added and removed names
+    added_names = new_names - old_names  # Names that appear in new but not in old
+    removed_names = old_names - new_names  # Names that were in old but not in new
+
+    # find file path changes for unchanged names
+    unchanged_names = old_names & new_names  # Names that exist in both
+    file_diff = DeepDiff(
+        {name: old_concepts[name] for name in unchanged_names},
+        {name: new_concepts[name] for name in unchanged_names},
+    )
+
+    # Find renamed concepts (same file, different name)
+    renamed_concepts = []
+    for removed in removed_names:
+        old_path = old_concepts[removed]
+        for added in added_names:
+            new_path = new_concepts[added]
+            if old_path == new_path:
+                renamed_concepts.append((removed, added))
+
+    # Remove renamed concepts from added and removed sets
+    for old_name, new_name in renamed_concepts:
+        added_names.discard(new_name)
+        removed_names.discard(old_name)
+
+    # generate config report
+    if added_names:
+        report += "## Added Concepts\n"
+        for name in added_names:
+            report += f"- `{name}` (File: `{new_concepts[name]}`)\n"
+        report += "\n"
+
+    if removed_names:
+        report += "## Removed Concepts\n"
+        for name in removed_names:
+            report += f"- `{name}` (File: `{old_concepts[name]}`)\n"
+        report += "\n"
+
+    if renamed_concepts:
+        report += "## Renamed Concepts\n"
+        for old_name, new_name in renamed_concepts:
+            report += (
+                f"- `{old_name}` ➝ `{new_name}` (File: `{old_concepts[old_name]}`)\n"
+            )
+        report += "\n"
+
+    if "values_changed" in file_diff:
+        report += "## Updated File Paths\n"
+        for name, change in file_diff["values_changed"].items():
+            old_file = change["old_value"]
+            new_file = change["new_value"]
+            clean_name = name.split("root['")[1].split("']")[0]
+            report += (
+                f"- `{clean_name}` changed file from `{old_file}` ➝ `{new_file}`\n"
+            )
+        report += "\n"
+
+    if not (
+        added_names
+        or removed_names
+        or renamed_concepts
+        or file_diff.get("values_changed")
+    ):
+        report += "No changes in concept sets.\n"
+
+    return report
+
+
+def diff_map_files(old_map_path, new_map_path):
     old_output_files = [
         file.name
         for file in old_map_path.iterdir()
@@ -898,19 +986,15 @@ def diff(phen_dir, phen_old_dir):
     # Outputs that are the intersection of old_output_set and new_output_set
     common_outputs = old_output_set & new_output_set
 
-    # Write outputs report
-    new_config = new_phen_path / CONFIG_FILE
-    with new_config.open("r") as file:
-        new_config = yaml.safe_load(file)
-    report.write(f"\n\n# Report for version {new_config['phenotype']['version']}\n\n")
-    report.write(f"- Removed outputs: {list(removed_outputs)}\n")
-    report.write(f"- Added outputs: {list(added_outputs)}\n")
-    report.write(f"- Common outputs: {list(common_outputs)}\n")
+    report = f"\n# Changes to available translations\n"
+    report += f"This compares the coding translations files available.\n\n"
+    report += f"- Removed outputs: {sorted(list(removed_outputs))}\n"
+    report += f"- Added outputs: {sorted(list(added_outputs))}\n"
+    report += f"- Common outputs: {sorted(list(common_outputs))}\n\n"
 
-    report.write(
-        f"\n\n## Compare concepts {str(old_phen_path.resolve())} to {str(new_phen_path.resolve())}\n\n"
-    )
-    # Compare common outputs between versions
+    # Step N: Compare common outputs between versions
+    report += f"# Changes to concepts in translation files\n\n"
+    report += f"This compares the added and removed concepts in each of the coding translation files. Note that this might be different to the config.yaml if the translations have not been run for the current config.\n\n"
     for file in common_outputs:
         old_output = old_map_path / file
         new_output = new_map_path / file
@@ -924,12 +1008,11 @@ def diff(phen_dir, phen_old_dir):
         df2 = df2[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count()
 
         # Check for added and removed concepts
-        report.write(
-            "- Removed concepts {}\n".format(list(set(df1.index) - set(df2.index)))
-        )
-        report.write(
-            "- Added concepts {}\n".format(list(set(df2.index) - set(df1.index)))
-        )
+        report += f"- File {file}\n"
+        sorted_list = sorted(list(set(df1.index) - set(df2.index)))
+        report += f"- Removed concepts {sorted_list}\n"
+        sorted_list = sorted(list(set(df2.index) - set(df1.index)))
+        report += f"- Added concepts {sorted_list}\n"
 
         # Check for changed concepts
         diff = df2 - df1  # diff in counts
@@ -940,8 +1023,57 @@ def diff(phen_dir, phen_old_dir):
         if len(diff.index) > 0:
             for concept, row in diff.iterrows():
                 s += "\t - {} {}\n".format(concept, row["CONCEPT"])
-            report.write(f"- Changed concepts {s}\n\n")
+            report += f"- Changed concepts {s}\n\n"
         else:
-            report.write(f"- Changed concepts []\n\n")
+            report += f"- Changed concepts []\n\n"
+
+    return report
+
+
+def diff(phen_dir, phen_old_dir):
+    """Compare the differences between two versions of a phenotype"""
+
+    # validate phenotypes
+    validate(phen_old_dir)
+    validate(phen_dir)
+
+    # get old and new config
+    old_phen_path = Path(phen_old_dir)
+    old_config = old_phen_path / CONFIG_FILE
+    with old_config.open("r") as file:
+        old_config = yaml.safe_load(file)
+    new_phen_path = Path(phen_dir)
+    new_config = new_phen_path / CONFIG_FILE
+    with new_config.open("r") as file:
+        new_config = yaml.safe_load(file)
+
+    # write report heading
+    report = f"# Phenotype Comparison Report\n"
+    report += f"## Original phenotype\n"
+    report += f"  - {old_config['phenotype']['omop']['vocabulary_id']}\n"
+    report += f"  - {old_config['phenotype']['version']}\n"
+    report += f"  - {str(old_phen_path.resolve())}\n"
+    report += f"## Changed phenotype:\n"
+    report += f"  - {new_config['phenotype']['omop']['vocabulary_id']}\n"
+    report += f"  - {new_config['phenotype']['version']}\n"
+    report += f"  - {str(new_phen_path.resolve())}\n"
+
+    # Step 1: check differences configuration files
+    # Convert list of dicts into a dict: {name: file}
+    report += diff_config(old_config, new_config)
+
+    # Step 2: check differences between map files
+    # List files from output directories
+    old_map_path = old_phen_path / MAP_DIR
+    new_map_path = new_phen_path / MAP_DIR
+    report += diff_map_files(old_map_path, new_map_path)
+
+    # initialise report file
+    report_file_name = old_phen_path.name + "_diff.md"
+    report_path = new_phen_path / report_file_name
+    logger.debug(f"Writing to report file {str(report_path.resolve())}")
+    report_file = open(report_path, "w")
+    report_file.write(report)
+    report_file.close()
 
     logger.info(f"Phenotypes diff'd successfully")
diff --git a/contributing.md b/contributing.md
index 9443a71..d42e29d 100644
--- a/contributing.md
+++ b/contributing.md
@@ -69,6 +69,12 @@ To run tests using `pytest`, use:
 hatch run pytest
 ```
 
+To run a specific test and display print outputs `-s`, use
+
+```sh
+hatch run pytest -s -v tests/test_acmc.py::<test_function_name>
+```
+
 ## Running the CLI
 The package provides a command-line interface (CLI) entry point. 
 
diff --git a/examples/config1.yaml b/examples/config1.yaml
index 2f446aa..19fd9c6 100644
--- a/examples/config1.yaml
+++ b/examples/config1.yaml
@@ -1,8 +1,8 @@
 phenotype:
   version: "v1.0.1"
   omop:
-    vocabulary_id: "ACMC_Example"
-    vocabulary_name: "ACMC example phenotype"
+    vocabulary_id: "ACMC_Example_1"
+    vocabulary_name: "ACMC example 1 phenotype"
     vocabulary_reference: "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples"
   concept_sets:
     - name: "ABDO_PAIN"
diff --git a/examples/config2.yaml b/examples/config2.yaml
index 24acf96..33d6df4 100644
--- a/examples/config2.yaml
+++ b/examples/config2.yaml
@@ -1,8 +1,8 @@
 phenotype:
-  version: "v1.0.4"
+  version: "v1.0.1"
   omop:
-    vocabulary_id: "ACMC_Example"
-    vocabulary_name: "ACMC example phenotype"
+    vocabulary_id: "ACMC_Example_2"
+    vocabulary_name: "ACMC example 2 phenotype"
     vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
   concept_sets:
     - name: "CVD_EVENTS"
diff --git a/examples/config3.yaml b/examples/config3.yaml
index 411606a..926ab60 100644
--- a/examples/config3.yaml
+++ b/examples/config3.yaml
@@ -1,8 +1,8 @@
 phenotype:
-  version: "v1.0.4"
+  version: "v1.0.1"
   omop:
-    vocabulary_id: "ACMC_Example"
-    vocabulary_name: "ACMC example phenotype"
+    vocabulary_id: "ACMC_Example_3"
+    vocabulary_name: "ACMC example 3 phenotype"
     vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
   concept_sets:
     - name: "CVD_EVENTS"
diff --git a/pyproject.toml b/pyproject.toml
index de414cf..c340e83 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,7 @@ dependencies = [
     "cerberus",
     "click",
     "cramjam",
+    "deepdiff",
     "et-xmlfile",
     "fastparquet",
     "fsspec",
diff --git a/tests/test_acmc.py b/tests/test_acmc.py
index 0ad6f86..6533847 100644
--- a/tests/test_acmc.py
+++ b/tests/test_acmc.py
@@ -155,3 +155,168 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file):
         )
         main.main()
     assert "Phenotypes diff'd successfully" in caplog.text
+
+
+# TODO: Refactor this test so it is shortened, there's lots of duplicate codes
+def test_diff(tmp_dir, monkeypatch, caplog):
+    print(f"Temporary directory: {tmp_dir}")  # Prints path for debugging
+
+    # init phenotype
+    phen_path = tmp_dir / "phen"
+    phen_path = phen_path.resolve()
+    monkeypatch.setattr(
+        sys, "argv", ["main.py", "phen", "init", "-d", str(phen_path.resolve())]
+    )
+    # Mock input() to return "yes" to the question about reinitialising the directory
+    monkeypatch.setattr("builtins.input", lambda _: "y")
+    main.main()
+
+    # copy example codes
+    shutil.rmtree(phen_path / "codes")
+    ex_path = Path("./examples").resolve()
+    for item in ex_path.iterdir():
+        source = ex_path / item.name
+        destination = phen_path / item.name
+        if source.is_dir():
+            shutil.copytree(source, destination)
+        else:
+            shutil.copy(source, destination)
+
+    # map phenotype for config 1
+    shutil.copy(phen_path / "config1.yaml", phen_path / "config.yaml")
+    for code_type in ["read2"]:
+        with caplog.at_level(logging.DEBUG):
+            monkeypatch.setattr(
+                sys,
+                "argv",
+                [
+                    "main.py",
+                    "phen",
+                    "map",
+                    "-d",
+                    str(phen_path.resolve()),
+                    "-t",
+                    code_type,
+                ],
+            )
+            main.main()
+    assert "Phenotype processed successfully" in caplog.text
+
+    # publish phenotype
+    with caplog.at_level(logging.DEBUG):
+        monkeypatch.setattr(
+            sys, "argv", ["main.py", "phen", "publish", "-d", str(phen_path.resolve())]
+        )
+        main.main()
+    assert "Phenotype published successfully" in caplog.text
+
+    # copy phenotype'
+    with caplog.at_level(logging.DEBUG):
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "main.py",
+                "phen",
+                "copy",
+                "-d",
+                str(phen_path.resolve()),
+                "-td",
+                str(tmp_dir.resolve()),
+                "-v",
+                "v1.0.3",
+            ],
+        )
+        main.main()
+    assert "Phenotype copied successfully" in caplog.text
+
+    # map phenotype example 2
+    shutil.copy(phen_path / "config2.yaml", phen_path / "config.yaml")
+    for code_type in ["read2", "read3"]:
+        with caplog.at_level(logging.DEBUG):
+            monkeypatch.setattr(
+                sys,
+                "argv",
+                [
+                    "main.py",
+                    "phen",
+                    "map",
+                    "-d",
+                    str(phen_path.resolve()),
+                    "-t",
+                    code_type,
+                ],
+            )
+            main.main()
+    assert "Phenotype processed successfully" in caplog.text
+
+    # diff phenotype with v1.0.3
+    with caplog.at_level(logging.DEBUG):
+        old_path = tmp_dir / "v1.0.3"
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "main.py",
+                "phen",
+                "diff",
+                "-d",
+                str(phen_path.resolve()),
+                "-old",
+                str(old_path.resolve()),
+            ],
+        )
+        main.main()
+    assert "Phenotypes diff'd successfully" in caplog.text
+
+    # check changes
+    with open(phen_path / "v1.0.3_diff.md", "r") as file:
+        content = file.read()
+    assert "Removed concepts ['ABDO_PAIN']" in content
+    assert "Added concepts ['DID_NOT_ATTEND']" in content
+    assert "Added outputs: ['read3.csv']" in content
+
+    # map phenotype with example 3
+    shutil.copy(phen_path / "config3.yaml", phen_path / "config.yaml")
+    for code_type in ["read2", "read3", "snomed"]:
+        with caplog.at_level(logging.DEBUG):
+            monkeypatch.setattr(
+                sys,
+                "argv",
+                [
+                    "main.py",
+                    "phen",
+                    "map",
+                    "-d",
+                    str(phen_path.resolve()),
+                    "-t",
+                    code_type,
+                ],
+            )
+            main.main()
+    assert "Phenotype processed successfully" in caplog.text
+
+    # diff phenotype with v1.0.3
+    with caplog.at_level(logging.DEBUG):
+        old_path = tmp_dir / "v1.0.3"
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "main.py",
+                "phen",
+                "diff",
+                "-d",
+                str(phen_path.resolve()),
+                "-old",
+                str(old_path.resolve()),
+            ],
+        )
+        main.main()
+    assert "Phenotypes diff'd successfully" in caplog.text
+
+    with open(phen_path / "v1.0.3_diff.md", "r") as file:
+        content = file.read()
+    assert "Removed concepts ['ABDO_PAIN']" in content
+    assert "Added concepts ['DEPRESSION', 'DID_NOT_ATTEND', 'HYPERTENSION']" in content
+    assert "Added outputs: ['read3.csv', 'snomed.csv']" in content
-- 
GitLab