Skip to content
Snippets Groups Projects
Commit 55bf62f0 authored by mjbonifa's avatar mjbonifa
Browse files

Merge branch '19-convert-json-config-to-yaml-as-human-readable' into 'dev'

refactor: converted all examples to yaml and added test for config2. Updated...

Closes #19

See merge request meldb/concepts-processing!7
parents 4b1a2f55 78e6fcd6
No related branches found
No related tags found
No related merge requests found
...@@ -189,7 +189,7 @@ cp -r ./examples/codes/* ./workspace/phen/codes ...@@ -189,7 +189,7 @@ cp -r ./examples/codes/* ./workspace/phen/codes
From the command prompt, copy example phenotype configuration files `/examples/config.json` to the phenotype directory: From the command prompt, copy example phenotype configuration files `/examples/config.json` to the phenotype directory:
```bash ```bash
cp -r ./examples/config.json ./workspace/phen cp -r ./examples/config1.yaml ./workspace/phen/config.yaml
``` ```
- You can view the configuration file here [`config.json`](./examples/config.json) - You can view the configuration file here [`config.json`](./examples/config.json)
......
...@@ -32,7 +32,7 @@ MAP_DIR = "map" ...@@ -32,7 +32,7 @@ MAP_DIR = "map"
CONCEPT_SET_DIR = "concept-set" CONCEPT_SET_DIR = "concept-set"
OMOP_DIR = "omop" OMOP_DIR = "omop"
DEFAULT_PHEN_DIR_LIST = [CODES_DIR, MAP_DIR, CONCEPT_SET_DIR, OMOP_DIR] DEFAULT_PHEN_DIR_LIST = [CODES_DIR, MAP_DIR, CONCEPT_SET_DIR, OMOP_DIR]
CONFIG_FILE = "config.json" CONFIG_FILE = "config.yaml"
VOCAB_VERSION_FILE = "vocab_version.yaml" VOCAB_VERSION_FILE = "vocab_version.yaml"
DEFAULT_GIT_BRANCH = "main" DEFAULT_GIT_BRANCH = "main"
...@@ -202,9 +202,8 @@ def init(phen_dir, remote_url): ...@@ -202,9 +202,8 @@ def init(phen_dir, remote_url):
"codes": [], "codes": [],
} }
config_path = phen_path / CONFIG_FILE with open(phen_path / CONFIG_FILE, "w") as file:
with open(config_path, "w", encoding="utf-8") as f: yaml.dump(config, file, default_flow_style=False, sort_keys=False)
json.dump(config, f, indent=4)
# add git ignore # add git ignore
ignore_content = """# Ignore SQLite database files ignore_content = """# Ignore SQLite database files
...@@ -233,7 +232,9 @@ def validate(phen_dir): ...@@ -233,7 +232,9 @@ def validate(phen_dir):
logger.info(f"Validating phenotype: {phen_dir}") logger.info(f"Validating phenotype: {phen_dir}")
phen_path = Path(phen_dir) phen_path = Path(phen_dir)
if not phen_path.is_dir(): if not phen_path.is_dir():
raise NotADirectoryError(f"Error: '{phen_path}' is not a directory") raise NotADirectoryError(
f"Error: '{str(phen_path.resolve())}' is not a directory"
)
config_path = phen_path / CONFIG_FILE config_path = phen_path / CONFIG_FILE
if not config_path.is_file(): if not config_path.is_file():
...@@ -254,8 +255,9 @@ def validate(phen_dir): ...@@ -254,8 +255,9 @@ def validate(phen_dir):
raise Exception(f"Phen directory {phen_path} is not a git repo") raise Exception(f"Phen directory {phen_path} is not a git repo")
# Load configuration File # Load configuration File
if config_path.suffix == ".json": if config_path.suffix == ".yaml":
mapping = json.load(open(config_path, "rb")) with config_path.open("r") as file:
mapping = yaml.safe_load(file)
else: else:
raise Exception( raise Exception(
f"Unsupported configuration filetype: {str(config_path.resolve())}" f"Unsupported configuration filetype: {str(config_path.resolve())}"
...@@ -582,7 +584,9 @@ def map(phen_dir, target_code_type): ...@@ -582,7 +584,9 @@ def map(phen_dir, target_code_type):
codes_path = phen_path / CODES_DIR codes_path = phen_path / CODES_DIR
# load configuration # load configuration
config = json.load(open(config_path, "rb")) with config_path.open("r") as file:
config = yaml.safe_load(file)
concept_sets = config["concept_sets"] concept_sets = config["concept_sets"]
codes = config["codes"] codes = config["codes"]
...@@ -633,6 +637,7 @@ def map(phen_dir, target_code_type): ...@@ -633,6 +637,7 @@ def map(phen_dir, target_code_type):
df = df.groupby(divide_col) df = df.groupby(divide_col)
# Map to Concept/Phenotype # Map to Concept/Phenotype
# TODO: This code needs refactory as it seems handling of the concept_set_categories should happen at another place
if len(df.index) != 0: if len(df.index) != 0:
if ("concept_set" in file) and isinstance(df, pd.core.frame.DataFrame): if ("concept_set" in file) and isinstance(df, pd.core.frame.DataFrame):
out = map_file( out = map_file(
...@@ -661,10 +666,10 @@ def map(phen_dir, target_code_type): ...@@ -661,10 +666,10 @@ def map(phen_dir, target_code_type):
concepts=file["concept_set_categories"][cat], concepts=file["concept_set_categories"][cat],
meta_columns=meta_columns, meta_columns=meta_columns,
) )
else: else:
raise AttributeError( raise AttributeError(
f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy" f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
) )
else: else:
logger.warning( logger.warning(
f"File {file} has no output after preprocessing in config {str(config_path.resolve())}" f"File {file} has no output after preprocessing in config {str(config_path.resolve())}"
...@@ -672,8 +677,10 @@ def map(phen_dir, target_code_type): ...@@ -672,8 +677,10 @@ def map(phen_dir, target_code_type):
if len(code_errors) > 0: if len(code_errors) > 0:
logger.error(f"The map processing has {len(code_errors)} errors") logger.error(f"The map processing has {len(code_errors)} errors")
error_path = phen_path / MAP_DIR / "errors"
error_path.mkdir(parents=True, exist_ok=True)
error_filename = f"{target_code_type}-code-errors.csv" error_filename = f"{target_code_type}-code-errors.csv"
write_code_errors(code_errors, phen_path / MAP_DIR / error_filename) write_code_errors(code_errors, error_path / error_filename)
# Check there is output from processing # Check there is output from processing
if len(out.index) == 0: if len(out.index) == 0:
...@@ -681,7 +688,6 @@ def map(phen_dir, target_code_type): ...@@ -681,7 +688,6 @@ def map(phen_dir, target_code_type):
raise Exception( raise Exception(
f"No output after map processing, check config {str(config_path.resolve())}" f"No output after map processing, check config {str(config_path.resolve())}"
) )
# Final processing # Final processing
out = out.reset_index(drop=True) out = out.reset_index(drop=True)
out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"]) out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
...@@ -760,7 +766,8 @@ def publish(phen_dir): ...@@ -760,7 +766,8 @@ def publish(phen_dir):
# get major version from configuration file # get major version from configuration file
config_path = phen_path / CONFIG_FILE config_path = phen_path / CONFIG_FILE
config = json.load(open(config_path, "rb")) with config_path.open("r") as file:
config = yaml.safe_load(file)
match = re.match(r"v(\d+\.\d+)", config["concept_sets"]["version"]) match = re.match(r"v(\d+\.\d+)", config["concept_sets"]["version"])
major_version = match.group(1) major_version = match.group(1)
...@@ -772,8 +779,8 @@ def publish(phen_dir): ...@@ -772,8 +779,8 @@ def publish(phen_dir):
version = f"v{major_version}.{next_minor_version}" version = f"v{major_version}.{next_minor_version}"
logger.debug(f"New version: {version}") logger.debug(f"New version: {version}")
config["concept_sets"]["version"] = version config["concept_sets"]["version"] = version
with open(config_path, "w", encoding="utf-8") as f: with open(config_path, "w") as file:
json.dump(config, f, indent=4) yaml.dump(config, file, default_flow_style=False, sort_keys=False)
# Add and commit changes to repo # Add and commit changes to repo
commit_message = f"Committing updates to phenotype {phen_path}" commit_message = f"Committing updates to phenotype {phen_path}"
...@@ -808,7 +815,8 @@ def export(phen_dir, version): ...@@ -808,7 +815,8 @@ def export(phen_dir, version):
# load configuration # load configuration
config_path = phen_path / CONFIG_FILE config_path = phen_path / CONFIG_FILE
config = json.load(open(config_path, "rb")) with config_path.open("r") as file:
config = yaml.safe_load(file)
map_path = phen_path / MAP_DIR map_path = phen_path / MAP_DIR
if not map_path.exists(): if not map_path.exists():
...@@ -919,8 +927,9 @@ def diff(phen_dir, phen_old_dir): ...@@ -919,8 +927,9 @@ def diff(phen_dir, phen_old_dir):
common_outputs = old_output_set & new_output_set common_outputs = old_output_set & new_output_set
# Write outputs report # Write outputs report
new_config_path = new_phen_path / CONFIG_FILE new_config = new_phen_path / CONFIG_FILE
new_config = json.load(open(new_config_path, "rb")) with new_config.open("r") as file:
new_config = yaml.safe_load(file)
report.write( report.write(
f"\n\n# Report for version {new_config['concept_sets']['version']}\n\n" f"\n\n# Report for version {new_config['concept_sets']['version']}\n\n"
) )
...@@ -936,6 +945,9 @@ def diff(phen_dir, phen_old_dir): ...@@ -936,6 +945,9 @@ def diff(phen_dir, phen_old_dir):
old_output = old_map_path / file old_output = old_map_path / file
new_output = new_map_path / file new_output = new_map_path / file
logger.debug(f"Old ouptput: {str(old_output.resolve())}")
logger.debug(f"New ouptput: {str(new_output.resolve())}")
df1 = pd.read_csv(old_output) df1 = pd.read_csv(old_output)
df1 = df1[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count() df1 = df1[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count()
df2 = pd.read_csv(new_output) df2 = pd.read_csv(new_output)
...@@ -963,21 +975,3 @@ def diff(phen_dir, phen_old_dir): ...@@ -963,21 +975,3 @@ def diff(phen_dir, phen_old_dir):
report.write(f"- Changed concepts []\n\n") report.write(f"- Changed concepts []\n\n")
logger.info(f"Phenotypes diff'd successfully") logger.info(f"Phenotypes diff'd successfully")
# Here's the atlas code that needs to go into another function
# if output_path == "atlas":
# if output_path == "atlas":
# vocab_id = summary_config["omop"]["vocabulary_id"]
# vocab_version = summary_config["version"]
# vocab_name = summary_config["omop"]["vocabulary_name"]
# vocab_reference = summary_config["omop"]["vocabulary_reference"]
# Create New OMOP Vocabulary
# omop_setup(OMOP_DB_PATH, vocab_id, vocab_version, vocab_name, vo#cab_reference)
# Export to DB
# omop_publish_concept_sets(out,
# OMOP_DB_PATH,
# vocab_id,
# omop_vocab_types[target_code_type],
# vocab_version,)
{
"concept_sets": {
"version": "v1.0.1",
"omop": {
"vocabulary_id": "ACMC_Example",
"vocabulary_name": "ACMC example phenotype",
"vocabulary_reference": "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples"
},
"concept_set": [
{
"concept_set_name": "ABDO_PAIN",
"concept_set_status": "AGREED",
"metadata": {
}
}
]
},
"codes": [
{
"folder": "clinical-codes-org",
"description": "Downloaded 16/11/23",
"files": [
{
"file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv",
"columns": {
"read2": "code",
"metadata": [
"description"
]
},
"concept_set": [
"ABDO_PAIN"
]
}
]
}
]
}
concept_sets:
version: "v1.0.1"
omop:
vocabulary_id: "ACMC_Example"
vocabulary_name: "ACMC example phenotype"
vocabulary_reference: "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples"
concept_set:
- concept_set_name: "ABDO_PAIN"
concept_set_status: "AGREED"
metadata: {}
codes:
- folder: "clinical-codes-org"
description: "Downloaded 16/11/23"
files:
- file: "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
columns:
read2: "code"
metadata:
- "description"
concept_set:
- "ABDO_PAIN"
{
"concept_sets": {
"version": "v1.0.4",
"omop": {
"vocabulary_id": "ACMC_Example",
"vocabulary_name": "ACMC example phenotype",
"vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
},
"concept_set": [
{
"concept_set_name": "CVD_EVENTS",
"concept_set_status": "AGREED",
"metadata": {}
},
{
"concept_set_name": "DID_NOT_ATTEND",
"concept_set_status": "AGREED",
"metadata": {}
}
]
},
"codes": [
{
"folder": "clinical-codes-org",
"description": "Downloaded 16/11/23",
"files": [
{
"file": "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv",
"columns": {
"icd10": "code",
"metadata": []
},
"concept_set": [
"CVD_EVENTS"
]
},
{
"file": "Non-attendance codes/res201-did-not-attend-appointment.csv",
"columns": {
"read2": "code",
"metadata": []
},
"concept_set": [
"DID_NOT_ATTEND"
]
}
]
}
]
}
concept_sets:
version: "v1.0.4"
omop:
vocabulary_id: "ACMC_Example"
vocabulary_name: "ACMC example phenotype"
vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
concept_set:
- concept_set_name: "CVD_EVENTS"
concept_set_status: "AGREED"
metadata: {}
- concept_set_name: "DID_NOT_ATTEND"
concept_set_status: "AGREED"
metadata: {}
codes:
- folder: "clinical-codes-org"
description: "Downloaded 16/11/23"
files:
- file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
columns:
icd10: "code"
metadata: []
concept_set:
- "CVD_EVENTS"
- file: "Non-attendance codes/res201-did-not-attend-appointment.csv"
columns:
read2: "code"
metadata: []
concept_set:
- "DID_NOT_ATTEND"
...@@ -44,7 +44,18 @@ def test_phen_init_local_specified(tmp_dir, monkeypatch, caplog): ...@@ -44,7 +44,18 @@ def test_phen_init_local_specified(tmp_dir, monkeypatch, caplog):
assert "Phenotype initialised successfully" in caplog.text assert "Phenotype initialised successfully" in caplog.text
def test_phen_workflow(tmp_dir, monkeypatch, caplog): # TODO: This test will need to be refactored so that the expected outputs match the config files
# right now it just tests that it runs successfully and does not check the contents of the output
@pytest.mark.parametrize(
"config_file",
[
("config1.yaml"), # config.yaml test case
("config2.yaml"), # config.yaml test case
],
)
def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file):
print(f"Temporary directory: {tmp_dir}") # Prints path for debugging
with caplog.at_level(logging.DEBUG): with caplog.at_level(logging.DEBUG):
phen_path = tmp_dir / "phen" phen_path = tmp_dir / "phen"
phen_path = phen_path.resolve() phen_path = phen_path.resolve()
...@@ -69,6 +80,9 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog): ...@@ -69,6 +80,9 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog):
else: else:
shutil.copy(source, destination) shutil.copy(source, destination)
# copy the test file to configuration
shutil.copy(phen_path / config_file, phen_path / "config.yaml")
monkeypatch.setattr( monkeypatch.setattr(
sys, "argv", ["main.py", "phen", "validate", "-d", str(phen_path.resolve())] sys, "argv", ["main.py", "phen", "validate", "-d", str(phen_path.resolve())]
) )
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment