Skip to content
Snippets Groups Projects
Commit c4a105e1 authored by mjbonifa's avatar mjbonifa
Browse files

Merge branch...

Merge branch '34-test-write-the-test-for-diff-between-phen-versions-bug-observed-with-adding-concept' into 'dev'

test: added tests for diff between config 1, 2, 3 in test_diff. Updated the...

Closes #34

See merge request meldb/concepts-processing!17
parents 8479e963 ba8a7ccb
No related branches found
No related tags found
No related merge requests found
...@@ -12,6 +12,7 @@ import logging ...@@ -12,6 +12,7 @@ import logging
import requests import requests
import yaml import yaml
from cerberus import Validator from cerberus import Validator
from deepdiff import DeepDiff
from pathlib import Path from pathlib import Path
from urllib.parse import urlparse, urlunparse from urllib.parse import urlparse, urlunparse
...@@ -152,28 +153,32 @@ def create_empty_git_dir(path): ...@@ -152,28 +153,32 @@ def create_empty_git_dir(path):
keep_path.touch(exist_ok=True) keep_path.touch(exist_ok=True)
def check_delete_dir(path, msg):
deleted = False
user_input = input(f"{msg}").strip().lower()
if user_input in ["yes", "y"]:
shutil.rmtree(path)
deleted = True
else:
logger.info("Directory was not deleted.")
return deleted
def init(phen_dir, remote_url): def init(phen_dir, remote_url):
"""Initial phenotype directory as git repo with standard structure""" """Initial phenotype directory as git repo with standard structure"""
logger.info(f"Initialising Phenotype in directory: {phen_dir}") logger.info(f"Initialising Phenotype in directory: {phen_dir}")
phen_path = Path(phen_dir) phen_path = Path(phen_dir)
# check if directory already exists and ask user if they want to recreate it # check if directory already exists and ask user if they want to recreate it
configure = False
if ( if (
phen_path.exists() and phen_path.is_dir() phen_path.exists() and phen_path.is_dir()
): # Check if it exists and is a directory ): # Check if it exists and is a directory
user_input = ( configure = check_delete_dir(
input( phen_path,
f"The phen directory already exists. Do you want to reinitialise? (yes/no): " f"The phen directory already exists. Do you want to reinitialise? (yes/no): ",
)
.strip()
.lower()
) )
if user_input in ["yes", "y"]:
shutil.rmtree(phen_path)
configure = True
else:
logger.info("Phen directory was not recreated.")
else: else:
configure = True configure = True
...@@ -829,16 +834,22 @@ def copy(phen_dir, target_dir, version): ...@@ -829,16 +834,22 @@ def copy(phen_dir, target_dir, version):
copy_path = target_path / version copy_path = target_path / version
logger.info(f"Copying repo {phen_path} to {copy_path}") logger.info(f"Copying repo {phen_path} to {copy_path}")
if not copy_path.exists(): if (
# If copy directory doesn't exist, clone the repo copy_path.exists() and copy_path.is_dir()
): # Check if it exists and is a directory
copy = check_delete_dir(
copy_path,
f"The directory {str(copy_path.resolve())} already exists. Do you want to overwrite? (yes/no): ",
)
else:
copy = True
if not copy:
logger.info(f"Not copying the version {version}")
return
logger.debug(f"Cloning repo from {phen_path} into {copy_path}...") logger.debug(f"Cloning repo from {phen_path} into {copy_path}...")
repo = git.Repo.clone_from(phen_path, copy_path) repo = git.Repo.clone_from(phen_path, copy_path)
else:
# If copy directory exists, open the repo
logger.debug(
f"Copy of repository already exists in {copy_path}. Opening the repo..."
)
repo = git.Repo(copy_path)
# Check out the latest commit or specified version # Check out the latest commit or specified version
if version: if version:
...@@ -855,27 +866,104 @@ def copy(phen_dir, target_dir, version): ...@@ -855,27 +866,104 @@ def copy(phen_dir, target_dir, version):
logger.info(f"Phenotype copied successfully") logger.info(f"Phenotype copied successfully")
def diff(phen_dir, phen_old_dir): # Convert concept_sets list into dictionaries
"""Compare the differences between two versions of a phenotype""" def extract_concepts(config_data):
"""Extracts concepts as {name: file_path} dictionary and a name set."""
concepts_dict = {
item["name"]: item["file"]["path"]
for item in config_data["phenotype"]["concept_sets"]
}
name_set = set(concepts_dict.keys())
return concepts_dict, name_set
# validate phenotype directories
validate(phen_old_dir)
validate(phen_dir)
old_phen_path = Path(phen_old_dir) def extract_clean_deepdiff_keys(diff, key_type):
new_phen_path = Path(phen_dir) """
Extracts clean keys from a DeepDiff dictionary.
# Load report (FOR SOME REASON THIS WAS APPEND SO SET TO w for NOW) :param diff: DeepDiff result dictionary
report_file_name = old_phen_path.name + "_diff.md" :param key_type: The type of change to extract (e.g., "dictionary_item_added", "dictionary_item_removed")
report_path = new_phen_path / report_file_name :return: A set of clean key names
report = open(report_path, "w") """
logger.debug(f"Writing to report file {str(report_path.resolve())}") return {key.split("root['")[1].split("']")[0] for key in diff.get(key_type, [])}
# Get maps files from phenotype
old_map_path = old_phen_path / MAP_DIR
new_map_path = new_phen_path / MAP_DIR
# List files from output directories def diff_config(old_config, new_config):
report = f"\n# Changes to phenotype configuration\n"
report += f"This compares changes in the phenotype configuration including added, removed and renamed concept sets and changes to concept set source concept code file paths\n\n"
old_concepts, old_names = extract_concepts(old_config)
new_concepts, new_names = extract_concepts(new_config)
# Check added and removed names
added_names = new_names - old_names # Names that appear in new but not in old
removed_names = old_names - new_names # Names that were in old but not in new
# find file path changes for unchanged names
unchanged_names = old_names & new_names # Names that exist in both
file_diff = DeepDiff(
{name: old_concepts[name] for name in unchanged_names},
{name: new_concepts[name] for name in unchanged_names},
)
# Find renamed concepts (same file, different name)
renamed_concepts = []
for removed in removed_names:
old_path = old_concepts[removed]
for added in added_names:
new_path = new_concepts[added]
if old_path == new_path:
renamed_concepts.append((removed, added))
# Remove renamed concepts from added and removed sets
for old_name, new_name in renamed_concepts:
added_names.discard(new_name)
removed_names.discard(old_name)
# generate config report
if added_names:
report += "## Added Concepts\n"
for name in added_names:
report += f"- `{name}` (File: `{new_concepts[name]}`)\n"
report += "\n"
if removed_names:
report += "## Removed Concepts\n"
for name in removed_names:
report += f"- `{name}` (File: `{old_concepts[name]}`)\n"
report += "\n"
if renamed_concepts:
report += "## Renamed Concepts\n"
for old_name, new_name in renamed_concepts:
report += (
f"- `{old_name}` ➝ `{new_name}` (File: `{old_concepts[old_name]}`)\n"
)
report += "\n"
if "values_changed" in file_diff:
report += "## Updated File Paths\n"
for name, change in file_diff["values_changed"].items():
old_file = change["old_value"]
new_file = change["new_value"]
clean_name = name.split("root['")[1].split("']")[0]
report += (
f"- `{clean_name}` changed file from `{old_file}` ➝ `{new_file}`\n"
)
report += "\n"
if not (
added_names
or removed_names
or renamed_concepts
or file_diff.get("values_changed")
):
report += "No changes in concept sets.\n"
return report
def diff_map_files(old_map_path, new_map_path):
old_output_files = [ old_output_files = [
file.name file.name
for file in old_map_path.iterdir() for file in old_map_path.iterdir()
...@@ -898,19 +986,15 @@ def diff(phen_dir, phen_old_dir): ...@@ -898,19 +986,15 @@ def diff(phen_dir, phen_old_dir):
# Outputs that are the intersection of old_output_set and new_output_set # Outputs that are the intersection of old_output_set and new_output_set
common_outputs = old_output_set & new_output_set common_outputs = old_output_set & new_output_set
# Write outputs report report = f"\n# Changes to available translations\n"
new_config = new_phen_path / CONFIG_FILE report += f"This compares the coding translations files available.\n\n"
with new_config.open("r") as file: report += f"- Removed outputs: {sorted(list(removed_outputs))}\n"
new_config = yaml.safe_load(file) report += f"- Added outputs: {sorted(list(added_outputs))}\n"
report.write(f"\n\n# Report for version {new_config['phenotype']['version']}\n\n") report += f"- Common outputs: {sorted(list(common_outputs))}\n\n"
report.write(f"- Removed outputs: {list(removed_outputs)}\n")
report.write(f"- Added outputs: {list(added_outputs)}\n")
report.write(f"- Common outputs: {list(common_outputs)}\n")
report.write( # Step N: Compare common outputs between versions
f"\n\n## Compare concepts {str(old_phen_path.resolve())} to {str(new_phen_path.resolve())}\n\n" report += f"# Changes to concepts in translation files\n\n"
) report += f"This compares the added and removed concepts in each of the coding translation files. Note that this might be different to the config.yaml if the translations have not been run for the current config.\n\n"
# Compare common outputs between versions
for file in common_outputs: for file in common_outputs:
old_output = old_map_path / file old_output = old_map_path / file
new_output = new_map_path / file new_output = new_map_path / file
...@@ -924,12 +1008,11 @@ def diff(phen_dir, phen_old_dir): ...@@ -924,12 +1008,11 @@ def diff(phen_dir, phen_old_dir):
df2 = df2[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count() df2 = df2[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count()
# Check for added and removed concepts # Check for added and removed concepts
report.write( report += f"- File {file}\n"
"- Removed concepts {}\n".format(list(set(df1.index) - set(df2.index))) sorted_list = sorted(list(set(df1.index) - set(df2.index)))
) report += f"- Removed concepts {sorted_list}\n"
report.write( sorted_list = sorted(list(set(df2.index) - set(df1.index)))
"- Added concepts {}\n".format(list(set(df2.index) - set(df1.index))) report += f"- Added concepts {sorted_list}\n"
)
# Check for changed concepts # Check for changed concepts
diff = df2 - df1 # diff in counts diff = df2 - df1 # diff in counts
...@@ -940,8 +1023,57 @@ def diff(phen_dir, phen_old_dir): ...@@ -940,8 +1023,57 @@ def diff(phen_dir, phen_old_dir):
if len(diff.index) > 0: if len(diff.index) > 0:
for concept, row in diff.iterrows(): for concept, row in diff.iterrows():
s += "\t - {} {}\n".format(concept, row["CONCEPT"]) s += "\t - {} {}\n".format(concept, row["CONCEPT"])
report.write(f"- Changed concepts {s}\n\n") report += f"- Changed concepts {s}\n\n"
else: else:
report.write(f"- Changed concepts []\n\n") report += f"- Changed concepts []\n\n"
return report
def diff(phen_dir, phen_old_dir):
"""Compare the differences between two versions of a phenotype"""
# validate phenotypes
validate(phen_old_dir)
validate(phen_dir)
# get old and new config
old_phen_path = Path(phen_old_dir)
old_config = old_phen_path / CONFIG_FILE
with old_config.open("r") as file:
old_config = yaml.safe_load(file)
new_phen_path = Path(phen_dir)
new_config = new_phen_path / CONFIG_FILE
with new_config.open("r") as file:
new_config = yaml.safe_load(file)
# write report heading
report = f"# Phenotype Comparison Report\n"
report += f"## Original phenotype\n"
report += f" - {old_config['phenotype']['omop']['vocabulary_id']}\n"
report += f" - {old_config['phenotype']['version']}\n"
report += f" - {str(old_phen_path.resolve())}\n"
report += f"## Changed phenotype:\n"
report += f" - {new_config['phenotype']['omop']['vocabulary_id']}\n"
report += f" - {new_config['phenotype']['version']}\n"
report += f" - {str(new_phen_path.resolve())}\n"
# Step 1: check differences configuration files
# Convert list of dicts into a dict: {name: file}
report += diff_config(old_config, new_config)
# Step 2: check differences between map files
# List files from output directories
old_map_path = old_phen_path / MAP_DIR
new_map_path = new_phen_path / MAP_DIR
report += diff_map_files(old_map_path, new_map_path)
# initialise report file
report_file_name = old_phen_path.name + "_diff.md"
report_path = new_phen_path / report_file_name
logger.debug(f"Writing to report file {str(report_path.resolve())}")
report_file = open(report_path, "w")
report_file.write(report)
report_file.close()
logger.info(f"Phenotypes diff'd successfully") logger.info(f"Phenotypes diff'd successfully")
...@@ -69,6 +69,12 @@ To run tests using `pytest`, use: ...@@ -69,6 +69,12 @@ To run tests using `pytest`, use:
hatch run pytest hatch run pytest
``` ```
To run a specific test and display print outputs `-s`, use
```sh
hatch run pytest -s -v tests/test_acmc.py::<test_function_name>
```
## Running the CLI ## Running the CLI
The package provides a command-line interface (CLI) entry point. The package provides a command-line interface (CLI) entry point.
......
phenotype: phenotype:
version: "v1.0.1" version: "v1.0.1"
omop: omop:
vocabulary_id: "ACMC_Example" vocabulary_id: "ACMC_Example_1"
vocabulary_name: "ACMC example phenotype" vocabulary_name: "ACMC example 1 phenotype"
vocabulary_reference: "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples" vocabulary_reference: "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples"
concept_sets: concept_sets:
- name: "ABDO_PAIN" - name: "ABDO_PAIN"
......
phenotype: phenotype:
version: "v1.0.4" version: "v1.0.1"
omop: omop:
vocabulary_id: "ACMC_Example" vocabulary_id: "ACMC_Example_2"
vocabulary_name: "ACMC example phenotype" vocabulary_name: "ACMC example 2 phenotype"
vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example" vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
concept_sets: concept_sets:
- name: "CVD_EVENTS" - name: "CVD_EVENTS"
......
phenotype: phenotype:
version: "v1.0.4" version: "v1.0.1"
omop: omop:
vocabulary_id: "ACMC_Example" vocabulary_id: "ACMC_Example_3"
vocabulary_name: "ACMC example phenotype" vocabulary_name: "ACMC example 3 phenotype"
vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example" vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
concept_sets: concept_sets:
- name: "CVD_EVENTS" - name: "CVD_EVENTS"
......
...@@ -19,6 +19,7 @@ dependencies = [ ...@@ -19,6 +19,7 @@ dependencies = [
"cerberus", "cerberus",
"click", "click",
"cramjam", "cramjam",
"deepdiff",
"et-xmlfile", "et-xmlfile",
"fastparquet", "fastparquet",
"fsspec", "fsspec",
......
...@@ -155,3 +155,168 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file): ...@@ -155,3 +155,168 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file):
) )
main.main() main.main()
assert "Phenotypes diff'd successfully" in caplog.text assert "Phenotypes diff'd successfully" in caplog.text
# TODO: Refactor this test so it is shortened, there's lots of duplicate codes
def test_diff(tmp_dir, monkeypatch, caplog):
print(f"Temporary directory: {tmp_dir}") # Prints path for debugging
# init phenotype
phen_path = tmp_dir / "phen"
phen_path = phen_path.resolve()
monkeypatch.setattr(
sys, "argv", ["main.py", "phen", "init", "-d", str(phen_path.resolve())]
)
# Mock input() to return "yes" to the question about reinitialising the directory
monkeypatch.setattr("builtins.input", lambda _: "y")
main.main()
# copy example codes
shutil.rmtree(phen_path / "codes")
ex_path = Path("./examples").resolve()
for item in ex_path.iterdir():
source = ex_path / item.name
destination = phen_path / item.name
if source.is_dir():
shutil.copytree(source, destination)
else:
shutil.copy(source, destination)
# map phenotype for config 1
shutil.copy(phen_path / "config1.yaml", phen_path / "config.yaml")
for code_type in ["read2"]:
with caplog.at_level(logging.DEBUG):
monkeypatch.setattr(
sys,
"argv",
[
"main.py",
"phen",
"map",
"-d",
str(phen_path.resolve()),
"-t",
code_type,
],
)
main.main()
assert "Phenotype processed successfully" in caplog.text
# publish phenotype
with caplog.at_level(logging.DEBUG):
monkeypatch.setattr(
sys, "argv", ["main.py", "phen", "publish", "-d", str(phen_path.resolve())]
)
main.main()
assert "Phenotype published successfully" in caplog.text
# copy phenotype'
with caplog.at_level(logging.DEBUG):
monkeypatch.setattr(
sys,
"argv",
[
"main.py",
"phen",
"copy",
"-d",
str(phen_path.resolve()),
"-td",
str(tmp_dir.resolve()),
"-v",
"v1.0.3",
],
)
main.main()
assert "Phenotype copied successfully" in caplog.text
# map phenotype example 2
shutil.copy(phen_path / "config2.yaml", phen_path / "config.yaml")
for code_type in ["read2", "read3"]:
with caplog.at_level(logging.DEBUG):
monkeypatch.setattr(
sys,
"argv",
[
"main.py",
"phen",
"map",
"-d",
str(phen_path.resolve()),
"-t",
code_type,
],
)
main.main()
assert "Phenotype processed successfully" in caplog.text
# diff phenotype with v1.0.3
with caplog.at_level(logging.DEBUG):
old_path = tmp_dir / "v1.0.3"
monkeypatch.setattr(
sys,
"argv",
[
"main.py",
"phen",
"diff",
"-d",
str(phen_path.resolve()),
"-old",
str(old_path.resolve()),
],
)
main.main()
assert "Phenotypes diff'd successfully" in caplog.text
# check changes
with open(phen_path / "v1.0.3_diff.md", "r") as file:
content = file.read()
assert "Removed concepts ['ABDO_PAIN']" in content
assert "Added concepts ['DID_NOT_ATTEND']" in content
assert "Added outputs: ['read3.csv']" in content
# map phenotype with example 3
shutil.copy(phen_path / "config3.yaml", phen_path / "config.yaml")
for code_type in ["read2", "read3", "snomed"]:
with caplog.at_level(logging.DEBUG):
monkeypatch.setattr(
sys,
"argv",
[
"main.py",
"phen",
"map",
"-d",
str(phen_path.resolve()),
"-t",
code_type,
],
)
main.main()
assert "Phenotype processed successfully" in caplog.text
# diff phenotype with v1.0.3
with caplog.at_level(logging.DEBUG):
old_path = tmp_dir / "v1.0.3"
monkeypatch.setattr(
sys,
"argv",
[
"main.py",
"phen",
"diff",
"-d",
str(phen_path.resolve()),
"-old",
str(old_path.resolve()),
],
)
main.main()
assert "Phenotypes diff'd successfully" in caplog.text
with open(phen_path / "v1.0.3_diff.md", "r") as file:
content = file.read()
assert "Removed concepts ['ABDO_PAIN']" in content
assert "Added concepts ['DEPRESSION', 'DID_NOT_ATTEND', 'HYPERTENSION']" in content
assert "Added outputs: ['read3.csv', 'snomed.csv']" in content
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment