Merge branch...

Merge branch '34-test-write-the-test-for-diff-between-phen-versions-bug-observed-with-adding-concept' into 'dev' test: added tests for diff between config 1, 2, 3 in test_diff. Updated the... Closes #34 See merge request meldb/concepts-processing!17

Merge branch...
c4a105e1 · mjbonifa · 8479e963 · ba8a7ccb · c4a105e1 · c4a105e1
Commit c4a105e1 authored 4 months ago by mjbonifa
--- a/acmc/omop.py
+++ b/acmc/omop.py
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -12,6 +12,7 @@ import logging
 import requests
 import yaml
 from cerberus import Validator
+from deepdiff import DeepDiff
 from pathlib import Path
 from urllib.parse import urlparse, urlunparse

@@ -152,28 +153,32 @@ def create_empty_git_dir(path):
    keep_path.touch(exist_ok=True)


+def check_delete_dir(path, msg):
+    deleted = False
+
+    user_input = input(f"{msg}").strip().lower()
+    if user_input in ["yes", "y"]:
+        shutil.rmtree(path)
+        deleted = True
+    else:
+        logger.info("Directory was not deleted.")
+
+    return deleted
+
+
 def init(phen_dir, remote_url):
    """Initial phenotype directory as git repo with standard structure"""
    logger.info(f"Initialising Phenotype in directory: {phen_dir}")
    phen_path = Path(phen_dir)

    # check if directory already exists and ask user if they want to recreate it
-    configure = False
    if (
        phen_path.exists() and phen_path.is_dir()
    ):  # Check if it exists and is a directory
-        user_input = (
-            input(
-                f"The phen directory already exists. Do you want to reinitialise? (yes/no): "
-            )
-            .strip()
-            .lower()
+        configure = check_delete_dir(
+            phen_path,
+            f"The phen directory already exists. Do you want to reinitialise? (yes/no): ",
        )
-        if user_input in ["yes", "y"]:
-            shutil.rmtree(phen_path)
-            configure = True
-        else:
-            logger.info("Phen directory was not recreated.")
    else:
        configure = True

@@ -829,16 +834,22 @@ def copy(phen_dir, target_dir, version):
    copy_path = target_path / version
    logger.info(f"Copying repo {phen_path} to {copy_path}")

-    if not copy_path.exists():
-        # If copy directory doesn't exist, clone the repo
+    if (
+        copy_path.exists() and copy_path.is_dir()
+    ):  # Check if it exists and is a directory
+        copy = check_delete_dir(
+            copy_path,
+            f"The directory {str(copy_path.resolve())} already exists. Do you want to overwrite? (yes/no): ",
+        )
+    else:
+        copy = True
+
+    if not copy:
+        logger.info(f"Not copying the version {version}")
+        return
+
    logger.debug(f"Cloning repo from {phen_path} into {copy_path}...")
    repo = git.Repo.clone_from(phen_path, copy_path)
-    else:
-        # If copy directory exists, open the repo
-        logger.debug(
-            f"Copy of repository already exists in {copy_path}. Opening the repo..."
-        )
-        repo = git.Repo(copy_path)

    # Check out the latest commit or specified version
    if version:
@@ -855,27 +866,104 @@ def copy(phen_dir, target_dir, version):
    logger.info(f"Phenotype copied successfully")


-def diff(phen_dir, phen_old_dir):
-    """Compare the differences between two versions of a phenotype"""
+# Convert concept_sets list into dictionaries
+def extract_concepts(config_data):
+    """Extracts concepts as {name: file_path} dictionary and a name set."""
+    concepts_dict = {
+        item["name"]: item["file"]["path"]
+        for item in config_data["phenotype"]["concept_sets"]
+    }
+    name_set = set(concepts_dict.keys())
+    return concepts_dict, name_set

-    # validate phenotype directories
-    validate(phen_old_dir)
-    validate(phen_dir)

-    old_phen_path = Path(phen_old_dir)
-    new_phen_path = Path(phen_dir)
+def extract_clean_deepdiff_keys(diff, key_type):
+    """
+    Extracts clean keys from a DeepDiff dictionary.

-    # Load report (FOR SOME REASON THIS WAS APPEND SO SET TO w for NOW)
-    report_file_name = old_phen_path.name + "_diff.md"
-    report_path = new_phen_path / report_file_name
-    report = open(report_path, "w")
-    logger.debug(f"Writing to report file {str(report_path.resolve())}")
+    :param diff: DeepDiff result dictionary
+    :param key_type: The type of change to extract (e.g., "dictionary_item_added", "dictionary_item_removed")
+    :return: A set of clean key names
+    """
+    return {key.split("root['")[1].split("']")[0] for key in diff.get(key_type, [])}

-    # Get maps files from phenotype
-    old_map_path = old_phen_path / MAP_DIR
-    new_map_path = new_phen_path / MAP_DIR

-    # List files from output directories
+def diff_config(old_config, new_config):
+    report = f"\n# Changes to phenotype configuration\n"
+    report += f"This compares changes in the phenotype configuration including added, removed and renamed concept sets and changes to concept set source concept code file paths\n\n"
+
+    old_concepts, old_names = extract_concepts(old_config)
+    new_concepts, new_names = extract_concepts(new_config)
+
+    # Check added and removed names
+    added_names = new_names - old_names  # Names that appear in new but not in old
+    removed_names = old_names - new_names  # Names that were in old but not in new
+
+    # find file path changes for unchanged names
+    unchanged_names = old_names & new_names  # Names that exist in both
+    file_diff = DeepDiff(
+        {name: old_concepts[name] for name in unchanged_names},
+        {name: new_concepts[name] for name in unchanged_names},
+    )
+
+    # Find renamed concepts (same file, different name)
+    renamed_concepts = []
+    for removed in removed_names:
+        old_path = old_concepts[removed]
+        for added in added_names:
+            new_path = new_concepts[added]
+            if old_path == new_path:
+                renamed_concepts.append((removed, added))
+
+    # Remove renamed concepts from added and removed sets
+    for old_name, new_name in renamed_concepts:
+        added_names.discard(new_name)
+        removed_names.discard(old_name)
+
+    # generate config report
+    if added_names:
+        report += "## Added Concepts\n"
+        for name in added_names:
+            report += f"- `{name}` (File: `{new_concepts[name]}`)\n"
+        report += "\n"
+
+    if removed_names:
+        report += "## Removed Concepts\n"
+        for name in removed_names:
+            report += f"- `{name}` (File: `{old_concepts[name]}`)\n"
+        report += "\n"
+
+    if renamed_concepts:
+        report += "## Renamed Concepts\n"
+        for old_name, new_name in renamed_concepts:
+            report += (
+                f"- `{old_name}` ➝ `{new_name}` (File: `{old_concepts[old_name]}`)\n"
+            )
+        report += "\n"
+
+    if "values_changed" in file_diff:
+        report += "## Updated File Paths\n"
+        for name, change in file_diff["values_changed"].items():
+            old_file = change["old_value"]
+            new_file = change["new_value"]
+            clean_name = name.split("root['")[1].split("']")[0]
+            report += (
+                f"- `{clean_name}` changed file from `{old_file}` ➝ `{new_file}`\n"
+            )
+        report += "\n"
+
+    if not (
+        added_names
+        or removed_names
+        or renamed_concepts
+        or file_diff.get("values_changed")
+    ):
+        report += "No changes in concept sets.\n"
+
+    return report
+
+
+def diff_map_files(old_map_path, new_map_path):
    old_output_files = [
        file.name
        for file in old_map_path.iterdir()
@@ -898,19 +986,15 @@ def diff(phen_dir, phen_old_dir):
    # Outputs that are the intersection of old_output_set and new_output_set
    common_outputs = old_output_set & new_output_set

-    # Write outputs report
-    new_config = new_phen_path / CONFIG_FILE
-    with new_config.open("r") as file:
-        new_config = yaml.safe_load(file)
-    report.write(f"\n\n# Report for version {new_config['phenotype']['version']}\n\n")
-    report.write(f"- Removed outputs: {list(removed_outputs)}\n")
-    report.write(f"- Added outputs: {list(added_outputs)}\n")
-    report.write(f"- Common outputs: {list(common_outputs)}\n")
+    report = f"\n# Changes to available translations\n"
+    report += f"This compares the coding translations files available.\n\n"
+    report += f"- Removed outputs: {sorted(list(removed_outputs))}\n"
+    report += f"- Added outputs: {sorted(list(added_outputs))}\n"
+    report += f"- Common outputs: {sorted(list(common_outputs))}\n\n"

-    report.write(
-        f"\n\n## Compare concepts {str(old_phen_path.resolve())} to {str(new_phen_path.resolve())}\n\n"
-    )
-    # Compare common outputs between versions
+    # Step N: Compare common outputs between versions
+    report += f"# Changes to concepts in translation files\n\n"
+    report += f"This compares the added and removed concepts in each of the coding translation files. Note that this might be different to the config.yaml if the translations have not been run for the current config.\n\n"
    for file in common_outputs:
        old_output = old_map_path / file
        new_output = new_map_path / file
@@ -924,12 +1008,11 @@ def diff(phen_dir, phen_old_dir):
        df2 = df2[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count()

        # Check for added and removed concepts
-        report.write(
-            "- Removed concepts {}\n".format(list(set(df1.index) - set(df2.index)))
-        )
-        report.write(
-            "- Added concepts {}\n".format(list(set(df2.index) - set(df1.index)))
-        )
+        report += f"- File {file}\n"
+        sorted_list = sorted(list(set(df1.index) - set(df2.index)))
+        report += f"- Removed concepts {sorted_list}\n"
+        sorted_list = sorted(list(set(df2.index) - set(df1.index)))
+        report += f"- Added concepts {sorted_list}\n"

        # Check for changed concepts
        diff = df2 - df1  # diff in counts
@@ -940,8 +1023,57 @@ def diff(phen_dir, phen_old_dir):
        if len(diff.index) > 0:
            for concept, row in diff.iterrows():
                s += "\t - {} {}\n".format(concept, row["CONCEPT"])
-            report.write(f"- Changed concepts {s}\n\n")
+            report += f"- Changed concepts {s}\n\n"
        else:
-            report.write(f"- Changed concepts []\n\n")
+            report += f"- Changed concepts []\n\n"
+
+    return report
+
+
+def diff(phen_dir, phen_old_dir):
+    """Compare the differences between two versions of a phenotype"""
+
+    # validate phenotypes
+    validate(phen_old_dir)
+    validate(phen_dir)
+
+    # get old and new config
+    old_phen_path = Path(phen_old_dir)
+    old_config = old_phen_path / CONFIG_FILE
+    with old_config.open("r") as file:
+        old_config = yaml.safe_load(file)
+    new_phen_path = Path(phen_dir)
+    new_config = new_phen_path / CONFIG_FILE
+    with new_config.open("r") as file:
+        new_config = yaml.safe_load(file)
+
+    # write report heading
+    report = f"# Phenotype Comparison Report\n"
+    report += f"## Original phenotype\n"
+    report += f"  - {old_config['phenotype']['omop']['vocabulary_id']}\n"
+    report += f"  - {old_config['phenotype']['version']}\n"
+    report += f"  - {str(old_phen_path.resolve())}\n"
+    report += f"## Changed phenotype:\n"
+    report += f"  - {new_config['phenotype']['omop']['vocabulary_id']}\n"
+    report += f"  - {new_config['phenotype']['version']}\n"
+    report += f"  - {str(new_phen_path.resolve())}\n"
+
+    # Step 1: check differences configuration files
+    # Convert list of dicts into a dict: {name: file}
+    report += diff_config(old_config, new_config)
+
+    # Step 2: check differences between map files
+    # List files from output directories
+    old_map_path = old_phen_path / MAP_DIR
+    new_map_path = new_phen_path / MAP_DIR
+    report += diff_map_files(old_map_path, new_map_path)
+
+    # initialise report file
+    report_file_name = old_phen_path.name + "_diff.md"
+    report_path = new_phen_path / report_file_name
+    logger.debug(f"Writing to report file {str(report_path.resolve())}")
+    report_file = open(report_path, "w")
+    report_file.write(report)
+    report_file.close()

    logger.info(f"Phenotypes diff'd successfully")
--- a/contributing.md
+++ b/contributing.md
@@ -69,6 +69,12 @@ To run tests using `pytest`, use:
 hatch run pytest
 ```

+To run a specific test and display print outputs `-s`, use
+
+```sh
+hatch run pytest -s -v tests/test_acmc.py::<test_function_name>
+```
+
 ## Running the CLI
 The package provides a command-line interface (CLI) entry point. 


--- a/examples/config1.yaml
+++ b/examples/config1.yaml
 phenotype:
  version: "v1.0.1"
  omop:
-    vocabulary_id: "ACMC_Example"
-    vocabulary_name: "ACMC example phenotype"
+    vocabulary_id: "ACMC_Example_1"
+    vocabulary_name: "ACMC example 1 phenotype"
    vocabulary_reference: "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples"
  concept_sets:
    - name: "ABDO_PAIN"

--- a/examples/config2.yaml
+++ b/examples/config2.yaml
 phenotype:
-  version: "v1.0.4"
+  version: "v1.0.1"
  omop:
-    vocabulary_id: "ACMC_Example"
-    vocabulary_name: "ACMC example phenotype"
+    vocabulary_id: "ACMC_Example_2"
+    vocabulary_name: "ACMC example 2 phenotype"
    vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
  concept_sets:
    - name: "CVD_EVENTS"

--- a/examples/config3.yaml
+++ b/examples/config3.yaml
 phenotype:
-  version: "v1.0.4"
+  version: "v1.0.1"
  omop:
-    vocabulary_id: "ACMC_Example"
-    vocabulary_name: "ACMC example phenotype"
+    vocabulary_id: "ACMC_Example_3"
+    vocabulary_name: "ACMC example 3 phenotype"
    vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
  concept_sets:
    - name: "CVD_EVENTS"

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,7 @@ dependencies = [
    "cerberus",
    "click",
    "cramjam",
+    "deepdiff",
    "et-xmlfile",
    "fastparquet",
    "fsspec",

--- a/tests/test_acmc.py
+++ b/tests/test_acmc.py
@@ -155,3 +155,168 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file):
        )
        main.main()
    assert "Phenotypes diff'd successfully" in caplog.text
+
+
+# TODO: Refactor this test so it is shortened, there's lots of duplicate codes
+def test_diff(tmp_dir, monkeypatch, caplog):
+    print(f"Temporary directory: {tmp_dir}")  # Prints path for debugging
+
+    # init phenotype
+    phen_path = tmp_dir / "phen"
+    phen_path = phen_path.resolve()
+    monkeypatch.setattr(
+        sys, "argv", ["main.py", "phen", "init", "-d", str(phen_path.resolve())]
+    )
+    # Mock input() to return "yes" to the question about reinitialising the directory
+    monkeypatch.setattr("builtins.input", lambda _: "y")
+    main.main()
+
+    # copy example codes
+    shutil.rmtree(phen_path / "codes")
+    ex_path = Path("./examples").resolve()
+    for item in ex_path.iterdir():
+        source = ex_path / item.name
+        destination = phen_path / item.name
+        if source.is_dir():
+            shutil.copytree(source, destination)
+        else:
+            shutil.copy(source, destination)
+
+    # map phenotype for config 1
+    shutil.copy(phen_path / "config1.yaml", phen_path / "config.yaml")
+    for code_type in ["read2"]:
+        with caplog.at_level(logging.DEBUG):
+            monkeypatch.setattr(
+                sys,
+                "argv",
+                [
+                    "main.py",
+                    "phen",
+                    "map",
+                    "-d",
+                    str(phen_path.resolve()),
+                    "-t",
+                    code_type,
+                ],
+            )
+            main.main()
+    assert "Phenotype processed successfully" in caplog.text
+
+    # publish phenotype
+    with caplog.at_level(logging.DEBUG):
+        monkeypatch.setattr(
+            sys, "argv", ["main.py", "phen", "publish", "-d", str(phen_path.resolve())]
+        )
+        main.main()
+    assert "Phenotype published successfully" in caplog.text
+
+    # copy phenotype'
+    with caplog.at_level(logging.DEBUG):
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "main.py",
+                "phen",
+                "copy",
+                "-d",
+                str(phen_path.resolve()),
+                "-td",
+                str(tmp_dir.resolve()),
+                "-v",
+                "v1.0.3",
+            ],
+        )
+        main.main()
+    assert "Phenotype copied successfully" in caplog.text
+
+    # map phenotype example 2
+    shutil.copy(phen_path / "config2.yaml", phen_path / "config.yaml")
+    for code_type in ["read2", "read3"]:
+        with caplog.at_level(logging.DEBUG):
+            monkeypatch.setattr(
+                sys,
+                "argv",
+                [
+                    "main.py",
+                    "phen",
+                    "map",
+                    "-d",
+                    str(phen_path.resolve()),
+                    "-t",
+                    code_type,
+                ],
+            )
+            main.main()
+    assert "Phenotype processed successfully" in caplog.text
+
+    # diff phenotype with v1.0.3
+    with caplog.at_level(logging.DEBUG):
+        old_path = tmp_dir / "v1.0.3"
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "main.py",
+                "phen",
+                "diff",
+                "-d",
+                str(phen_path.resolve()),
+                "-old",
+                str(old_path.resolve()),
+            ],
+        )
+        main.main()
+    assert "Phenotypes diff'd successfully" in caplog.text
+
+    # check changes
+    with open(phen_path / "v1.0.3_diff.md", "r") as file:
+        content = file.read()
+    assert "Removed concepts ['ABDO_PAIN']" in content
+    assert "Added concepts ['DID_NOT_ATTEND']" in content
+    assert "Added outputs: ['read3.csv']" in content
+
+    # map phenotype with example 3
+    shutil.copy(phen_path / "config3.yaml", phen_path / "config.yaml")
+    for code_type in ["read2", "read3", "snomed"]:
+        with caplog.at_level(logging.DEBUG):
+            monkeypatch.setattr(
+                sys,
+                "argv",
+                [
+                    "main.py",
+                    "phen",
+                    "map",
+                    "-d",
+                    str(phen_path.resolve()),
+                    "-t",
+                    code_type,
+                ],
+            )
+            main.main()
+    assert "Phenotype processed successfully" in caplog.text
+
+    # diff phenotype with v1.0.3
+    with caplog.at_level(logging.DEBUG):
+        old_path = tmp_dir / "v1.0.3"
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "main.py",
+                "phen",
+                "diff",
+                "-d",
+                str(phen_path.resolve()),
+                "-old",
+                str(old_path.resolve()),
+            ],
+        )
+        main.main()
+    assert "Phenotypes diff'd successfully" in caplog.text
+
+    with open(phen_path / "v1.0.3_diff.md", "r") as file:
+        content = file.read()
+    assert "Removed concepts ['ABDO_PAIN']" in content
+    assert "Added concepts ['DEPRESSION', 'DID_NOT_ATTEND', 'HYPERTENSION']" in content
+    assert "Added outputs: ['read3.csv', 'snomed.csv']" in content