diff --git a/README.md b/README.md index 11254581fb32a020451e73893fa92dc850c5cc51..5dfb6cce5e92e99ced354cc462488fb9bfd6934d 100644 --- a/README.md +++ b/README.md @@ -189,7 +189,7 @@ cp -r ./examples/codes/* ./workspace/phen/codes From the command prompt, copy example phenotype configuration files `/examples/config.json` to the phenotype directory: ```bash -cp -r ./examples/config.json ./workspace/phen +cp -r ./examples/config1.yaml ./workspace/phen/config.yaml ``` - You can view the configuarion file here [`config.json`](./examples/config.json) diff --git a/acmc/phen.py b/acmc/phen.py index dd2b6ff5017e922257b08f3d4e7ba0414e4b8836..14dc58348d5116658aa2eb908b411524ef01185b 100644 --- a/acmc/phen.py +++ b/acmc/phen.py @@ -32,7 +32,7 @@ MAP_DIR = "map" CONCEPT_SET_DIR = "concept-set" OMOP_DIR = "omop" DEFAULT_PHEN_DIR_LIST = [CODES_DIR, MAP_DIR, CONCEPT_SET_DIR, OMOP_DIR] -CONFIG_FILE = "config.json" +CONFIG_FILE = "config.yaml" VOCAB_VERSION_FILE = "vocab_version.yaml" DEFAULT_GIT_BRANCH = "main" @@ -202,9 +202,8 @@ def init(phen_dir, remote_url): "codes": [], } - config_path = phen_path / CONFIG_FILE - with open(config_path, "w", encoding="utf-8") as f: - json.dump(config, f, indent=4) + with open(phen_path / CONFIG_FILE, "w") as file: + yaml.dump(config, file, default_flow_style=False, sort_keys=False) # add git ignore ignore_content = """# Ignore SQLite database files @@ -233,7 +232,7 @@ def validate(phen_dir): logger.info(f"Validating phenotype: {phen_dir}") phen_path = Path(phen_dir) if not phen_path.is_dir(): - raise NotADirectoryError(f"Error: '{phen_path}' is not a directory") + raise NotADirectoryError(f"Error: '{str(phen_path.resolve())}' is not a directory") config_path = phen_path / CONFIG_FILE if not config_path.is_file(): @@ -254,8 +253,9 @@ def validate(phen_dir): raise Exception(f"Phen directory {phen_path} is not a git repo") # Load configuration File - if config_path.suffix == ".json": - mapping = json.load(open(config_path, "rb")) + if config_path.suffix == ".yaml": + with config_path.open("r") as file: + mapping = yaml.safe_load(file) else: raise Exception( f"Unsupported configuration filetype: {str(config_path.resolve())}" @@ -582,7 +582,9 @@ def map(phen_dir, target_code_type): codes_path = phen_path / CODES_DIR # load configuration - config = json.load(open(config_path, "rb")) + with config_path.open("r") as file: + config = yaml.safe_load(file) + concept_sets = config["concept_sets"] codes = config["codes"] @@ -633,6 +635,7 @@ def map(phen_dir, target_code_type): df = df.groupby(divide_col) # Map to Concept/Phenotype + # TODO: This code needs refactory as it seems handling of the concept_set_categories should happen at another place if len(df.index) != 0: if ("concept_set" in file) and isinstance(df, pd.core.frame.DataFrame): out = map_file( @@ -661,10 +664,10 @@ def map(phen_dir, target_code_type): concepts=file["concept_set_categories"][cat], meta_columns=meta_columns, ) - else: - raise AttributeError( - f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy" - ) + else: + raise AttributeError( + f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy" + ) else: logger.warning( f"File {file} has no output after preprocessing in config {str(config_path.resolve())}" @@ -672,8 +675,10 @@ def map(phen_dir, target_code_type): if len(code_errors) > 0: logger.error(f"The map processing has {len(code_errors)} errors") - error_filename = f"{target_code_type}-code-errors.csv" - write_code_errors(code_errors, phen_path / MAP_DIR / error_filename) + error_path = phen_path / MAP_DIR / "errors" + error_path.mkdir(parents=True, exist_ok=True) + error_filename = f"{target_code_type}-code-errors.csv" + write_code_errors(code_errors, error_path / error_filename) # Check there is output from processing if len(out.index) == 0: @@ -681,7 +686,6 @@ def map(phen_dir, target_code_type): raise Exception( f"No output after map processing, check config {str(config_path.resolve())}" ) - # Final processing out = out.reset_index(drop=True) out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"]) @@ -760,7 +764,8 @@ def publish(phen_dir): # get major version from configuration file config_path = phen_path / CONFIG_FILE - config = json.load(open(config_path, "rb")) + with config_path.open("r") as file: + config = yaml.safe_load(file) match = re.match(r"v(\d+\.\d+)", config["concept_sets"]["version"]) major_version = match.group(1) @@ -772,8 +777,8 @@ def publish(phen_dir): version = f"v{major_version}.{next_minor_version}" logger.debug(f"New version: {version}") config["concept_sets"]["version"] = version - with open(config_path, "w", encoding="utf-8") as f: - json.dump(config, f, indent=4) + with open(config_path, "w") as file: + yaml.dump(config, file, default_flow_style=False, sort_keys=False) # Add and commit changes to repo commit_message = f"Committing updates to phenotype {phen_path}" @@ -808,7 +813,8 @@ def export(phen_dir, version): # load configuration config_path = phen_path / CONFIG_FILE - config = json.load(open(config_path, "rb")) + with config_path.open("r") as file: + config = yaml.safe_load(file) map_path = phen_path / MAP_DIR if not map_path.exists(): @@ -919,8 +925,9 @@ def diff(phen_dir, phen_old_dir): common_outputs = old_output_set & new_output_set # Write outputs report - new_config_path = new_phen_path / CONFIG_FILE - new_config = json.load(open(new_config_path, "rb")) + new_config = new_phen_path / CONFIG_FILE + with new_config.open("r") as file: + new_config = yaml.safe_load(file) report.write( f"\n\n# Report for version {new_config['concept_sets']['version']}\n\n" ) @@ -936,9 +943,12 @@ def diff(phen_dir, phen_old_dir): old_output = old_map_path / file new_output = new_map_path / file + logger.debug(f"Old ouptput: {str(old_output.resolve())}") + logger.debug(f"New ouptput: {str(new_output.resolve())}") + df1 = pd.read_csv(old_output) df1 = df1[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count() - df2 = pd.read_csv(new_output) + df2 = pd.read_csv(new_output) df2 = df2[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count() # Check for added and removed concepts @@ -963,21 +973,3 @@ def diff(phen_dir, phen_old_dir): report.write(f"- Changed concepts []\n\n") logger.info(f"Phenotypes diff'd successfully") - - -# Here's the atlas code that needs to go into anotehr function -# if output_path == "atlas": -# vocab_id = summary_config["omop"]["vocabulary_id"] -# vocab_version = summary_config["version"] -# vocab_name = summary_config["omop"]["vocabulary_name"] -# vocab_reference = summary_config["omop"]["vocabulary_reference"] - -# Create New OMOP Vocabulary -# omop_setup(OMOP_DB_PATH, vocab_id, vocab_version, vocab_name, vo#cab_reference) - -# Export to DB -# omop_publish_concept_sets(out, -# OMOP_DB_PATH, -# vocab_id, -# omop_vocab_types[target_code_type], -# vocab_version,) diff --git a/examples/config.json b/examples/config.json deleted file mode 100644 index 2acfeae968ed0abe9330a98f763247e8bb1fbaaa..0000000000000000000000000000000000000000 --- a/examples/config.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "concept_sets": { - "version": "v1.0.1", - "omop": { - "vocabulary_id": "ACMC_Example", - "vocabulary_name": "ACMC example phenotype", - "vocabulary_reference": "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples" - }, - "concept_set": [ - { - "concept_set_name": "ABDO_PAIN", - "concept_set_status": "AGREED", - "metadata": { - } - } - ] - }, - "codes": [ - { - "folder": "clinical-codes-org", - "description": "Downloaded 16/11/23", - "files": [ - { - "file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv", - "columns": { - "read2": "code", - "metadata": [ - "description" - ] - }, - "concept_set": [ - "ABDO_PAIN" - ] - } - ] - } - ] -} diff --git a/examples/config.yaml b/examples/config1.yaml similarity index 100% rename from examples/config.yaml rename to examples/config1.yaml diff --git a/examples/config2.json b/examples/config2.json deleted file mode 100644 index 42578cffdf4bbb176646406fcd596f806b507639..0000000000000000000000000000000000000000 --- a/examples/config2.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "concept_sets": { - "version": "v1.0.4", - "omop": { - "vocabulary_id": "ACMC_Example", - "vocabulary_name": "ACMC example phenotype", - "vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example" - }, - "concept_set": [ - { - "concept_set_name": "CVD_EVENTS", - "concept_set_status": "AGREED", - "metadata": {} - }, - { - "concept_set_name": "DID_NOT_ATTEND", - "concept_set_status": "AGREED", - "metadata": {} - } - ] - }, - "codes": [ - { - "folder": "clinical-codes-org", - "description": "Downloaded 16/11/23", - "files": [ - { - "file": "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv", - "columns": { - "icd10": "code", - "metadata": [] - }, - "concept_set": [ - "CVD_EVENTS" - ] - }, - { - "file": "Non-attendance codes/res201-did-not-attend-appointment.csv", - "columns": { - "read2": "code", - "metadata": [] - }, - "concept_set": [ - "DID_NOT_ATTEND" - ] - } - ] - } - ] -} diff --git a/examples/config2.yaml b/examples/config2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee36425dfd69776fd450857132169c11f9f3dae4 --- /dev/null +++ b/examples/config2.yaml @@ -0,0 +1,31 @@ +concept_sets: + version: "v1.0.4" + omop: + vocabulary_id: "ACMC_Example" + vocabulary_name: "ACMC example phenotype" + vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example" + concept_set: + - concept_set_name: "CVD_EVENTS" + concept_set_status: "AGREED" + metadata: {} + - concept_set_name: "DID_NOT_ATTEND" + concept_set_status: "AGREED" + metadata: {} + +codes: + - folder: "clinical-codes-org" + description: "Downloaded 16/11/23" + files: + - file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv" + columns: + icd10: "code" + metadata: [] + concept_set: + - "CVD_EVENTS" + - file: "Non-attendance codes/res201-did-not-attend-appointment.csv" + columns: + read2: "code" + metadata: [] + concept_set: + - "DID_NOT_ATTEND" + diff --git a/tests/test_acmc.py b/tests/test_acmc.py index ecdadaefdea9fa514b6d6d7797cebdf5868f3f17..a89f0e6e5c68757e56c60aaac1f3609f9bcb28f9 100644 --- a/tests/test_acmc.py +++ b/tests/test_acmc.py @@ -44,7 +44,16 @@ def test_phen_init_local_specified(tmp_dir, monkeypatch, caplog): assert "Phenotype initialised successfully" in caplog.text -def test_phen_workflow(tmp_dir, monkeypatch, caplog): +# TODO: This test will need to be refactored so that the expected outputs match the config files +# right now it just tests that it runs successfully and does not check the contents of the output +@pytest.mark.parametrize("config_file", [ + ("config1.yaml"), # config.yaml test case + ("config2.yaml"), # config.yaml test case + +]) +def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file): + print(f"Temporary directory: {tmp_dir}") # Prints path for debugging + with caplog.at_level(logging.DEBUG): phen_path = tmp_dir / "phen" phen_path = phen_path.resolve() @@ -69,6 +78,9 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog): else: shutil.copy(source, destination) + # copy the test file to configuration + shutil.copy(phen_path / config_file, phen_path / "config.yaml") + monkeypatch.setattr( sys, "argv", ["main.py", "phen", "validate", "-d", str(phen_path.resolve())] )