From e549bf4bc63d56a6abda6a0feac674dcd3498ad0 Mon Sep 17 00:00:00 2001 From: Michael Boniface <m.j.boniface@soton.ac.uk> Date: Wed, 26 Feb 2025 19:59:56 +0000 Subject: [PATCH] fix: added map definition to the config.yaml so that we know which maps are expected for the phenotype. It is still possible to run one of them using the -t option but they must be specified in the config file. This means a user can run acmc phen map and all the required codes are created. It also reduces the chance of inconsistency between the map files generated between versions. It does not remove it entirely because it is still possible for a user to only run with a subset of the coding types but that should be discouraged. We retain the option for phenotype development because you might not want to run everything all of the time due to the time it takes. Closes #40. --- README.md | 2 +- acmc/main.py | 15 +++------------ acmc/omop.py | 4 ++-- acmc/parse.py | 3 +++ acmc/phen.py | 35 ++++++++++++++++++++++++++++++++--- docs/usage.md | 3 +-- examples/config1.yaml | 3 +++ examples/config2.yaml | 3 +++ examples/config3.yaml | 4 ++++ tests/test_acmc.py | 2 +- 10 files changed, 53 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 5dfb6cc..2175c98 100644 --- a/README.md +++ b/README.md @@ -215,7 +215,7 @@ Expected Output: Use the following `acmc` command to generate the phenotype in `read2` format: ```bash -acmc phen map -t read2 +acmc phen map ``` Expected Output: diff --git a/acmc/main.py b/acmc/main.py index 89df65d..b702518 100644 --- a/acmc/main.py +++ b/acmc/main.py @@ -3,7 +3,7 @@ import logging from pathlib import Path import acmc -from acmc import trud, omop, phen, logging_config as lc +from acmc import trud, omop, phen, parse, logging_config as lc # setup logging logger = lc.setup_logger() @@ -167,17 +167,8 @@ def main(): phen_map_parser.add_argument( "-t", "--target-coding", - required=True, - choices=["read2", "read3", "icd10", "snomed", "opcs4"], - help="Specify the
target coding (read2, read3, icd10, snomed, opcs4)", - ) - phen_map_parser.add_argument( - "-o", - "--output", - choices=["csv", "omop"], - nargs="+", # allows one or more values - default=["csv"], # default to CSV if not specified - help="Specify output format(s): 'csv', 'omop', or both (default: csv)", + choices=parse.SUPPORTED_CODE_TYPES, + help=f"Specify the target coding {parse.SUPPORTED_CODE_TYPES}", ) phen_map_parser.set_defaults(func=phen_map) diff --git a/acmc/omop.py b/acmc/omop.py index 4b8cc4c..dd5a461 100644 --- a/acmc/omop.py +++ b/acmc/omop.py @@ -16,7 +16,7 @@ logger = logging_config.setup_logger() # constants VOCAB_PATH = Path("./vocab/omop") -OMOP_CDM_Version ="54" +OMOP_CDM_Version = "54" OMOP_DB_FILENAME = f"omop_{OMOP_CDM_Version}.sqlite" DB_PATH = VOCAB_PATH / OMOP_DB_FILENAME VERSION_FILE = "omop_version.yaml" @@ -26,7 +26,7 @@ EXPORT_FILE = "omop_export.db" vocabularies = { "source": "OHDSI Athena", "url": "https://athena.ohdsi.org/vocabulary/list", - "cdm_version": OMOP_CDM_Version, + "cdm_version": OMOP_CDM_Version, "version": "", "vocabularies": [ {"id": 1, "name": "SNOMED"}, # No license required diff --git a/acmc/parse.py b/acmc/parse.py index ca5b3e1..798a10b 100644 --- a/acmc/parse.py +++ b/acmc/parse.py @@ -8,6 +8,9 @@ from acmc import trud, logging_config as lc # setup logging logger = lc.setup_logger() +# Define allowed values +SUPPORTED_CODE_TYPES = {"read2", "read3", "icd10", "snomed", "opcs4", "atc"} + class CodesError: """A class used in InvalidCodesException to report an error if a code parser check fails""" diff --git a/acmc/phen.py b/acmc/phen.py index a92bdb2..a9131b5 100644 --- a/acmc/phen.py +++ b/acmc/phen.py @@ -70,6 +70,15 @@ CONFIG_SCHEMA = { }, }, }, + "map": { + "type": "list", + "schema": { + "type": "string", + "allowed": list( + parse.SUPPORTED_CODE_TYPES + ), # Ensure only predefined values are allowed + }, + }, "concept_sets": { "type": "list", "required": True, @@ -260,6 +269,7 @@ def init(phen_dir, 
remote_url): "vocabulary_name": "", "vocabulary_reference": "", }, + "translate": [], "concept_sets": [], } } @@ -603,7 +613,6 @@ def write_vocab_version(phen_path): def map(phen_dir, target_code_type): logger.info(f"Processing phenotype: {phen_dir}") - logger.debug(f"Target coding format: {target_code_type}") # Validate configuration validate(phen_dir) @@ -611,13 +620,33 @@ def map(phen_dir, target_code_type): # initialise paths phen_path = Path(phen_dir) config_path = phen_path / CONFIG_FILE - codes_path = phen_path / CODES_DIR # load configuration with config_path.open("r") as file: config = yaml.safe_load(file) phenotype = config["phenotype"] + if len(phenotype["map"]) == 0: + raise ValueError(f"No map codes defined in the phenotype configuration") + + if target_code_type is not None and target_code_type not in phenotype["map"]: + raise ValueError( + f"Target code type {target_code_type} not in phenotype configuration map {phenotype['map']}" + ) + + if target_code_type is not None: + map_target_code_type(phen_path, phenotype, target_code_type) + else: + for t in phenotype["map"]: + map_target_code_type(phen_path, phenotype, t) + + logger.info(f"Phenotype processed successfully") + + +def map_target_code_type(phen_path, phenotype, target_code_type): + + logger.debug(f"Target coding format: {target_code_type}") + codes_path = phen_path / CODES_DIR # Create output dataframe out = pd.DataFrame([]) code_errors = [] @@ -717,7 +746,7 @@ def map(phen_dir, target_code_type): write_vocab_version(phen_path) - logger.info(f"Phenotype processed successfully") + logger.info(f"Phenotype processed target code type {target_code_type}") def publish(phen_dir): diff --git a/docs/usage.md b/docs/usage.md index 1527165..57271cf 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -96,9 +96,8 @@ The `phen` command is used phenotype-related operations. 
acmc phen map -d <PHENOTYPE_DIRECTORY> -t <TARGET_CODING> -o <OUTPUT_FORMAT> ``` - - `-t`, `--target-coding`: Specify the target coding (e.g., `read2`, `read3`, `icd10`, `snomed`, `opcs4`). + - `-t`, `--target-coding`: (Optional) Specify the target coding (e.g., `read2`, `read3`, `icd10`, `snomed`, `opcs4`). - `-d`, `--phen-dir`: (Optional) Directory of phenotype configuration (the default is ./build/phen). - - `-o`, `--output`: Output format(s) (`csv`, `omop`, or both), default is 'csv'. - **Publish Phenotype Configuration** diff --git a/examples/config1.yaml b/examples/config1.yaml index 19fd9c6..09d0e80 100644 --- a/examples/config1.yaml +++ b/examples/config1.yaml @@ -4,6 +4,9 @@ phenotype: vocabulary_id: "ACMC_Example_1" vocabulary_name: "ACMC example 1 phenotype" vocabulary_reference: "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples" + map: + - "read2" + - "read3" concept_sets: - name: "ABDO_PAIN" file: diff --git a/examples/config2.yaml b/examples/config2.yaml index 33d6df4..4c6252e 100644 --- a/examples/config2.yaml +++ b/examples/config2.yaml @@ -4,6 +4,9 @@ phenotype: vocabulary_id: "ACMC_Example_2" vocabulary_name: "ACMC example 2 phenotype" vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example" + map: + - "read2" + - "read3" concept_sets: - name: "CVD_EVENTS" file: diff --git a/examples/config3.yaml b/examples/config3.yaml index 926ab60..764d7d8 100644 --- a/examples/config3.yaml +++ b/examples/config3.yaml @@ -4,6 +4,10 @@ phenotype: vocabulary_id: "ACMC_Example_3" vocabulary_name: "ACMC example 3 phenotype" vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example" + map: + - "read2" + - "read3" + - "snomed" concept_sets: - name: "CVD_EVENTS" file: diff --git a/tests/test_acmc.py b/tests/test_acmc.py index 6533847..c4cb94e 100644 --- a/tests/test_acmc.py +++ b/tests/test_acmc.py @@ -91,7 +91,7 @@ def test_phen_workflow(tmp_dir, 
monkeypatch, caplog, config_file): assert "Phenotype validated successfully" in caplog.text # map phenotype - for code_type in ["read2", "read3", "snomed"]: + for code_type in ["read2", "read3"]: with caplog.at_level(logging.DEBUG): monkeypatch.setattr( sys, -- GitLab