Skip to content
Snippets Groups Projects
Commit 9eca4250 authored by mjbonifa's avatar mjbonifa
Browse files

refactor: converted all examples to yaml and added test for config2. Updated...

refactor: converted all examples to yaml and added test for config2. Updated readme. Fixed bug in diff that assumed all csv files in the map directory were map files, when some were error files (written when no codes exist); moved error files to an errors directory inside the map directory to avoid this.
parent 4f35f9ba
No related branches found
No related tags found
No related merge requests found
......@@ -189,7 +189,7 @@ cp -r ./examples/codes/* ./workspace/phen/codes
From the command prompt, copy example phenotype configuration files `/examples/config.json` to the phenotype directory:
```bash
cp -r ./examples/config.json ./workspace/phen
cp -r ./examples/config1.yaml ./workspace/phen/config.yaml
```
- You can view the configuration file here [`config.json`](./examples/config.json)
......
......@@ -32,7 +32,7 @@ MAP_DIR = "map"
CONCEPT_SET_DIR = "concept-set"
OMOP_DIR = "omop"
DEFAULT_PHEN_DIR_LIST = [CODES_DIR, MAP_DIR, CONCEPT_SET_DIR, OMOP_DIR]
CONFIG_FILE = "config.json"
CONFIG_FILE = "config.yaml"
VOCAB_VERSION_FILE = "vocab_version.yaml"
DEFAULT_GIT_BRANCH = "main"
......@@ -202,9 +202,8 @@ def init(phen_dir, remote_url):
"codes": [],
}
config_path = phen_path / CONFIG_FILE
with open(config_path, "w", encoding="utf-8") as f:
json.dump(config, f, indent=4)
with open(phen_path / CONFIG_FILE, "w") as file:
yaml.dump(config, file, default_flow_style=False, sort_keys=False)
# add git ignore
ignore_content = """# Ignore SQLite database files
......@@ -233,7 +232,7 @@ def validate(phen_dir):
logger.info(f"Validating phenotype: {phen_dir}")
phen_path = Path(phen_dir)
if not phen_path.is_dir():
raise NotADirectoryError(f"Error: '{phen_path}' is not a directory")
raise NotADirectoryError(f"Error: '{str(phen_path.resolve())}' is not a directory")
config_path = phen_path / CONFIG_FILE
if not config_path.is_file():
......@@ -254,8 +253,9 @@ def validate(phen_dir):
raise Exception(f"Phen directory {phen_path} is not a git repo")
# Load configuration File
if config_path.suffix == ".json":
mapping = json.load(open(config_path, "rb"))
if config_path.suffix == ".yaml":
with config_path.open("r") as file:
mapping = yaml.safe_load(file)
else:
raise Exception(
f"Unsupported configuration filetype: {str(config_path.resolve())}"
......@@ -582,7 +582,9 @@ def map(phen_dir, target_code_type):
codes_path = phen_path / CODES_DIR
# load configuration
config = json.load(open(config_path, "rb"))
with config_path.open("r") as file:
config = yaml.safe_load(file)
concept_sets = config["concept_sets"]
codes = config["codes"]
......@@ -633,6 +635,7 @@ def map(phen_dir, target_code_type):
df = df.groupby(divide_col)
# Map to Concept/Phenotype
# TODO: This code needs refactory as it seems handling of the concept_set_categories should happen at another place
if len(df.index) != 0:
if ("concept_set" in file) and isinstance(df, pd.core.frame.DataFrame):
out = map_file(
......@@ -661,10 +664,10 @@ def map(phen_dir, target_code_type):
concepts=file["concept_set_categories"][cat],
meta_columns=meta_columns,
)
else:
raise AttributeError(
f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
)
else:
raise AttributeError(
f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
)
else:
logger.warning(
f"File {file} has no output after preprocessing in config {str(config_path.resolve())}"
......@@ -672,8 +675,10 @@ def map(phen_dir, target_code_type):
if len(code_errors) > 0:
logger.error(f"The map processing has {len(code_errors)} errors")
error_filename = f"{target_code_type}-code-errors.csv"
write_code_errors(code_errors, phen_path / MAP_DIR / error_filename)
error_path = phen_path / MAP_DIR / "errors"
error_path.mkdir(parents=True, exist_ok=True)
error_filename = f"{target_code_type}-code-errors.csv"
write_code_errors(code_errors, error_path / error_filename)
# Check there is output from processing
if len(out.index) == 0:
......@@ -681,7 +686,6 @@ def map(phen_dir, target_code_type):
raise Exception(
f"No output after map processing, check config {str(config_path.resolve())}"
)
# Final processing
out = out.reset_index(drop=True)
out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
......@@ -760,7 +764,8 @@ def publish(phen_dir):
# get major version from configuration file
config_path = phen_path / CONFIG_FILE
config = json.load(open(config_path, "rb"))
with config_path.open("r") as file:
config = yaml.safe_load(file)
match = re.match(r"v(\d+\.\d+)", config["concept_sets"]["version"])
major_version = match.group(1)
......@@ -772,8 +777,8 @@ def publish(phen_dir):
version = f"v{major_version}.{next_minor_version}"
logger.debug(f"New version: {version}")
config["concept_sets"]["version"] = version
with open(config_path, "w", encoding="utf-8") as f:
json.dump(config, f, indent=4)
with open(config_path, "w") as file:
yaml.dump(config, file, default_flow_style=False, sort_keys=False)
# Add and commit changes to repo
commit_message = f"Committing updates to phenotype {phen_path}"
......@@ -808,7 +813,8 @@ def export(phen_dir, version):
# load configuration
config_path = phen_path / CONFIG_FILE
config = json.load(open(config_path, "rb"))
with config_path.open("r") as file:
config = yaml.safe_load(file)
map_path = phen_path / MAP_DIR
if not map_path.exists():
......@@ -919,8 +925,9 @@ def diff(phen_dir, phen_old_dir):
common_outputs = old_output_set & new_output_set
# Write outputs report
new_config_path = new_phen_path / CONFIG_FILE
new_config = json.load(open(new_config_path, "rb"))
new_config = new_phen_path / CONFIG_FILE
with new_config.open("r") as file:
new_config = yaml.safe_load(file)
report.write(
f"\n\n# Report for version {new_config['concept_sets']['version']}\n\n"
)
......@@ -936,9 +943,12 @@ def diff(phen_dir, phen_old_dir):
old_output = old_map_path / file
new_output = new_map_path / file
logger.debug(f"Old ouptput: {str(old_output.resolve())}")
logger.debug(f"New ouptput: {str(new_output.resolve())}")
df1 = pd.read_csv(old_output)
df1 = df1[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count()
df2 = pd.read_csv(new_output)
df2 = pd.read_csv(new_output)
df2 = df2[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count()
# Check for added and removed concepts
......@@ -963,21 +973,3 @@ def diff(phen_dir, phen_old_dir):
report.write(f"- Changed concepts []\n\n")
logger.info(f"Phenotypes diff'd successfully")
# Here's the atlas code that needs to go into anotehr function
# if output_path == "atlas":
# vocab_id = summary_config["omop"]["vocabulary_id"]
# vocab_version = summary_config["version"]
# vocab_name = summary_config["omop"]["vocabulary_name"]
# vocab_reference = summary_config["omop"]["vocabulary_reference"]
# Create New OMOP Vocabulary
# omop_setup(OMOP_DB_PATH, vocab_id, vocab_version, vocab_name, vo#cab_reference)
# Export to DB
# omop_publish_concept_sets(out,
# OMOP_DB_PATH,
# vocab_id,
# omop_vocab_types[target_code_type],
# vocab_version,)
{
"concept_sets": {
"version": "v1.0.1",
"omop": {
"vocabulary_id": "ACMC_Example",
"vocabulary_name": "ACMC example phenotype",
"vocabulary_reference": "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples"
},
"concept_set": [
{
"concept_set_name": "ABDO_PAIN",
"concept_set_status": "AGREED",
"metadata": {
}
}
]
},
"codes": [
{
"folder": "clinical-codes-org",
"description": "Downloaded 16/11/23",
"files": [
{
"file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv",
"columns": {
"read2": "code",
"metadata": [
"description"
]
},
"concept_set": [
"ABDO_PAIN"
]
}
]
}
]
}
File moved
{
"concept_sets": {
"version": "v1.0.4",
"omop": {
"vocabulary_id": "ACMC_Example",
"vocabulary_name": "ACMC example phenotype",
"vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
},
"concept_set": [
{
"concept_set_name": "CVD_EVENTS",
"concept_set_status": "AGREED",
"metadata": {}
},
{
"concept_set_name": "DID_NOT_ATTEND",
"concept_set_status": "AGREED",
"metadata": {}
}
]
},
"codes": [
{
"folder": "clinical-codes-org",
"description": "Downloaded 16/11/23",
"files": [
{
"file": "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv",
"columns": {
"icd10": "code",
"metadata": []
},
"concept_set": [
"CVD_EVENTS"
]
},
{
"file": "Non-attendance codes/res201-did-not-attend-appointment.csv",
"columns": {
"read2": "code",
"metadata": []
},
"concept_set": [
"DID_NOT_ATTEND"
]
}
]
}
]
}
concept_sets:
version: "v1.0.4"
omop:
vocabulary_id: "ACMC_Example"
vocabulary_name: "ACMC example phenotype"
vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
concept_set:
- concept_set_name: "CVD_EVENTS"
concept_set_status: "AGREED"
metadata: {}
- concept_set_name: "DID_NOT_ATTEND"
concept_set_status: "AGREED"
metadata: {}
codes:
- folder: "clinical-codes-org"
description: "Downloaded 16/11/23"
files:
- file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
columns:
icd10: "code"
metadata: []
concept_set:
- "CVD_EVENTS"
- file: "Non-attendance codes/res201-did-not-attend-appointment.csv"
columns:
read2: "code"
metadata: []
concept_set:
- "DID_NOT_ATTEND"
......@@ -44,7 +44,16 @@ def test_phen_init_local_specified(tmp_dir, monkeypatch, caplog):
assert "Phenotype initialised successfully" in caplog.text
def test_phen_workflow(tmp_dir, monkeypatch, caplog):
# TODO: This test will need to be refactored so that the expected outputs match the config files
# right now it just tests that it runs successfully and does not check the contents of the output
@pytest.mark.parametrize("config_file", [
("config1.yaml"), # config.yaml test case
("config2.yaml"), # config.yaml test case
])
def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file):
print(f"Temporary directory: {tmp_dir}") # Prints path for debugging
with caplog.at_level(logging.DEBUG):
phen_path = tmp_dir / "phen"
phen_path = phen_path.resolve()
......@@ -69,6 +78,9 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog):
else:
shutil.copy(source, destination)
# copy the test file to configuration
shutil.copy(phen_path / config_file, phen_path / "config.yaml")
monkeypatch.setattr(
sys, "argv", ["main.py", "phen", "validate", "-d", str(phen_path.resolve())]
)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment