diff --git a/.gitignore b/.gitignore index 17c54d31cde8a6e4bd91b5ef2ed88501ea28edf5..0c1bc8bd966744f01c0dd002b562d9e62034b0e7 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,9 @@ __pycache__ *.save* ~$* +# Build +.tox/ + # ACMC phenotype build files vocab/* diff --git a/README.md b/README.md index 75e848fc916b7f97f5906ab11d0f1ebf009c6bbf..11254581fb32a020451e73893fa92dc850c5cc51 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ Once installed, you'll be ready to use the `acmc` tool along with the associated - [ICD10 Edition 5 XML](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/28/items/259/releases) - [OPCS-4.10 Data Files](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/10/items/119/releases) - After subscribing, you'll receive an API key once your request is approved (usually within 24 hours). + After subscribing, you'll receive an API key once your request is approved (usually within a few hours). 4. **Get TRUD API KEY** @@ -98,7 +98,7 @@ Once installed, you'll be ready to use the `acmc` tool along with the associated * 154) NHS Ethnic Category * 155) NHS Place of Service - You will be notified by email with a vocabularies version number and link to download a zip file of OMOP database tables in CSV format. The subject will be `OHDSI Standardized Vocabularies. Your download link` from `pallas@ohdsi.org` + You will be notified by email (usually within an hour) with a vocabularies version number and link to download a zip file of OMOP database tables in CSV format. The subject will be `OHDSI Standardized Vocabularies. Your download link` from `pallas@ohdsi.org` ``` Content of your package @@ -127,7 +127,7 @@ Please execute the following process: Load the unpacked files into the tables. ``` - Download the OMOP file onto your computer and note the path to the file +Download the OMOP file onto your computer and note the path to the file 4. 
**Install OMOP vocabularies** @@ -155,7 +155,7 @@ Expected output: ## **Example Usage** -Follow these steps to initialize and manage a phenotype using `acmc`. In this example, we use a source concept code list for the Concept Set `Abdominal Pain` created from [ClinicalCodes.org](ClinicalCodes.org). The source concept codes are is read2. We genereate versioned phenotypes for read2 and then translate to snomed with a another version. +Follow these steps to initialize and manage a phenotype using `acmc`. In this example, we use a source concept list for the Concept Set `Abdominal Pain` created from [ClinicalCodes.org](ClinicalCodes.org). The source concept codes are read2. We generate versioned phenotypes for read2 and translate to snomed in normalised, standard formats. 1. **Initialize a phenotype in the workspace** @@ -181,7 +181,7 @@ Expected Output: cp -r ./examples/codes/* ./workspace/phen/codes ``` - - [Download `res176-abdominal-pain.csv`](.//examples/codes/clinical-codes-org/Symptom%20code%20lists/Abdominal%20pain/res176-abdominal-pain.csv) + - You can view the source code list here [`res176-abdominal-pain.csv`](.//examples/codes/clinical-codes-org/Symptom%20code%20lists/Abdominal%20pain/res176-abdominal-pain.csv) - Alternatively, place your code lists in `./workspace/phen/codes`. 3. **Copy the example phenotype configuration file to the phenotype directory** @@ -192,7 +192,7 @@ cp -r ./examples/codes/* ./workspace/phen/codes cp -r ./examples/config.json ./workspace/phen ``` - - [Download `config.json`](./examples/config.json) + - You can view the configuration file here [`config.json`](./examples/config.json) - Alternatively, place your own `config.json` file in `./workspace/phen`. 4. 
**Validate the phenotype configuration** @@ -330,7 +330,7 @@ Expected Output: ## Support -For issues, open an [issue in the repository](https://git.soton.ac.uk/meldb/concepts-processing/-/issues) +If you need help please open an [issue in the repository](https://git.soton.ac.uk/meldb/concepts-processing/-/issues) ## Contributing diff --git a/acmc/main.py b/acmc/main.py index 43fd39de75324939e27a3835b2235e9509f0866a..ed2b7aff3d6a14a79cb5881c1b67b7fee82c33b3 100644 --- a/acmc/main.py +++ b/acmc/main.py @@ -39,6 +39,11 @@ def phen_map(args): phen.map(args.phen_dir, args.target_coding) +def phen_export(args): + """Handle the `phen export` command.""" + phen.export(args.phen_dir, + args.version) + def phen_publish(args): """Handle the `phen publish` command.""" phen.publish(args.phen_dir) @@ -49,12 +54,6 @@ def phen_copy(args): args.target_dir, args.version) -def phen_copy(args): - """Handle the `phen copy` command.""" - phen.copy(args.phen_dir, - args.target_dir, - args.version) - def phen_diff(args): """Handle the `phen diff` command.""" phen.diff(args.phen_dir, @@ -145,6 +144,20 @@ def main(): help="Specify output format(s): 'csv', 'omop', or both (default: csv)") phen_map_parser.set_defaults(func=phen_map) + # phen export + phen_export_parser = phen_subparsers.add_parser("export", help="Export phen to OMOP database") + phen_export_parser.add_argument("-d", + "--phen-dir", + type=str, + default=str(phen.DEFAULT_PHEN_PATH.resolve()), + help="Phenotype workspace directory") + phen_export_parser.add_argument("-v", + "--version", + type=str, + default='latest', + help="Phenotype version to export, defaults to the latest version") + phen_export_parser.set_defaults(func=phen_export) + # phen publish phen_publish_parser = phen_subparsers.add_parser("publish", help="Publish phenotype configuration") phen_publish_parser.add_argument("-d", diff --git a/acmc/omop.py b/acmc/omop.py index e63cc278311ec4d59c2b4b6bf4f2e7f9fa673ee6..2a0bfd1ff0ee7284ff1b8e81387a7d7c4785be93 100644 --- 
a/acmc/omop.py +++ b/acmc/omop.py @@ -5,6 +5,7 @@ import pandas as pd import json import logging import zipfile +import shutil from pathlib import Path @@ -18,6 +19,7 @@ VOCAB_PATH = Path('./vocab/omop') DB_PATH = VOCAB_PATH / 'omop_54.sqlite' VERSION_FILE = 'omop_version.json' VERSION_PATH = VOCAB_PATH / VERSION_FILE +EXPORT_FILE = 'omop_export.db' vocabularies = { "source": "OHDSI Athena", @@ -39,6 +41,17 @@ vocabularies = { "tables": [] } +omop_vocab_types = { + "read2": "Read", + "read3": None, + "icd10": "ICD10CM", + "snomed": "SNOMED", + "opcs4": "OPCS4", + "atc": "ATC", + "med": None, + "cprd": None, +} + #Populate SQLite3 Database with default OMOP CONCEPTS def install (omop_zip_file, version): """Installs the OMOP release csv files in a file-based sql database""" @@ -167,80 +180,101 @@ def vocab_exists(cursor, vocab_id): return result is not None -def setup(db_path, vocab_id, vocab_version, vocab_name, vocab_reference): - #Setup SQLite3 Database for OMOP - conn = sqlite3.connect(db_path) - cur = conn.cursor() +def concept_set_exist(cursor, concept_set_name): + + query = f"SELECT EXISTS (SELECT 1 FROM CONCEPT_SET WHERE concept_set_name = ?)" + cursor.execute(query, (concept_set_name,)) + + # EXISTS yields 1 when a matching row is present, 0 otherwise; return that as a bool + return cursor.fetchone()[0] == 1 - #Check if DB populated with necessary VOCABULARY - if not table_exists(cur, "VOCABULARY"): - raise Exception(f"Error {db_path} is not yet populated with OMOP VOCABULARY. 
Please download from https://athena.ohdsi.org/.") - - #Check if Vocabulary already exists - elif not omop_vocab_exists(cur, vocab_id): - #Create VOCABULARY - df_test = pd.DataFrame([{ - "vocabulary_id": vocab_id, - "vocabulary_name": vocab_name, - "vocabulary_reference": vocab_reference, - "vocabulary_version": vocab_version, - # "vocabulary_concept_id": 0, - }]) - df_test.to_sql("VOCABULARY", conn, if_exists='append', index=False) - - #Check if CONCEPT_SET table exists - if not table_exists(cur, "CONCEPT_SET"): - cur.execute(""" - CREATE TABLE CONCEPT_SET ( - concept_set_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each concept set - atlas_id INTEGER, -- Unique identifier generated by ATLAS - concept_set_name TEXT, -- Optional name for the concept set - concept_set_description TEXT, -- Optional description for the concept set - vocabulary_id TEXT NOT NULL, -- Foreign key to VOCABULARY table - FOREIGN KEY (vocabulary_id) REFERENCES VOCABULARY(vocabulary_id) - );""") +def export(map_path, export_path, version, omop_metadata): + logger.debug(f"exporting with metadata {omop_metadata} at version {version}") + + # copy the baseline omop database + export_db_path = export_path / EXPORT_FILE + shutil.copy(DB_PATH, export_db_path) + + # connect to db + conn = sqlite3.connect(export_db_path) + cur = conn.cursor() + + #Create VOCABULARY + df_test = pd.DataFrame([{ + "vocabulary_id": omop_metadata['vocabulary_id'], + "vocabulary_name": omop_metadata['vocabulary_name'], + "vocabulary_reference": omop_metadata['vocabulary_reference'], + "vocabulary_version": version, + # "vocabulary_concept_id": 0, + }]) + df_test.to_sql("VOCABULARY", conn, if_exists='append', index=False) + + # Create CONCEPT_SET + cur.execute(""" + CREATE TABLE CONCEPT_SET ( + concept_set_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each concept set + atlas_id INTEGER, -- Unique identifier generated by ATLAS + concept_set_name TEXT, -- Optional name for the concept set + 
concept_set_description TEXT, -- Optional description for the concept set + vocabulary_id TEXT NOT NULL, -- Foreign key to VOCABULARY table + FOREIGN KEY (vocabulary_id) REFERENCES VOCABULARY(vocabulary_id) + );""") - #Check if CONCEPT_SET_ITEM table exists - if not table_exists(cur, "CONCEPT_SET_ITEM"): - cur.execute(""" - CREATE TABLE CONCEPT_SET_ITEM ( - concept_set_item_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each mapping - concept_set_id INTEGER NOT NULL, -- Foreign key to CONCEPT_SET table - concept_id INTEGER NOT NULL, -- Foreign key to CONCEPT table - FOREIGN KEY (concept_set_id) REFERENCES CONCEPT_SET(concept_set_id), - FOREIGN KEY (concept_id) REFERENCES CONCEPT(concept_id) - );""") - - conn.close() - -def publish_concept_sets(out, db_path, vocab_output, vocab_type, output_version): - conn = sqlite3.connect(db_path) - cur = conn.cursor() - - for concept_set_name, grp in out.groupby("CONCEPT_SET"): - #Create Concept_Set - if not sql_row_exist(conn, "CONCEPT_SET", "concept_set_name", concept_set_name): - cur.execute(f"INSERT INTO CONCEPT_SET (concept_set_name, vocabulary_id) VALUES ('{concept_set_name}', 'MELDB');") - else: - logger.debug("concept_set", concept_set_name, "already exists") - #TODO: ask to remove old concept_set? - - #Get Concept_set_Id - query = "SELECT concept_set_id FROM CONCEPT_SET WHERE concept_set_name = ? AND vocabulary_id = ?;" - cur.execute(query, (concept_set_name, vocab_output, )) - concept_set_id = cur.fetchone()[0] - - #Get corresponing Concept_id (OMOP) for each Concept_code (e.g. SNOMED) - concept_codes = "'"+"', '".join(list(grp["CONCEPT"].astype(str)))+"'" - query = f"SELECT concept_id FROM CONCEPT WHERE vocabulary_id = ? 
AND concept_code IN ({concept_codes});" - cur.execute(query, (vocab_type, )) - df_out = pd.DataFrame(cur.fetchall(), columns=["concept_id"]) - - if not len(grp) == len(df_out): - logger.error("ERROR: Some", vocab_type, "Codes do not exist in OMOP Database") - - #Create Concept_set_item - df_out["concept_set_id"] = concept_set_id - df_out.to_sql("CONCEPT_SET_ITEM", conn, if_exists='append', index=False) - - conn.close() + # Create CONCEPT_SET_ITEM + cur.execute(""" + CREATE TABLE CONCEPT_SET_ITEM ( + concept_set_item_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each mapping + concept_set_id INTEGER NOT NULL, -- Foreign key to CONCEPT_SET table + concept_id INTEGER NOT NULL, -- Foreign key to CONCEPT table + FOREIGN KEY (concept_set_id) REFERENCES CONCEPT_SET(concept_set_id), + FOREIGN KEY (concept_id) REFERENCES CONCEPT(concept_id) + );""") + + + # read map files + map_files = list(map_path.glob("*.csv")) + total = len(map_files) + logger.info(f"Exporting {total} map files") + for index, map_file in enumerate(map_files): + logger.info(f"Processing {index+1} of {total}: {map_file}") + df = pd.read_csv(map_file) + + for concept_set_name, grp in df.groupby("CONCEPT_SET"): + + # create Concept_Set (bound parameters, so names containing quotes cannot break or inject into the SQL) + if not concept_set_exist(cur, concept_set_name): + cur.execute("INSERT INTO CONCEPT_SET (concept_set_name, vocabulary_id) VALUES (?, ?);", (concept_set_name, omop_metadata['vocabulary_id'])) + else: + logger.debug(f"Concept_set {concept_set_name} already exists") + #TODO: ask to remove old concept_set? + + # get Concept_set_Id + query = "SELECT concept_set_id FROM CONCEPT_SET WHERE concept_set_name = ? 
AND vocabulary_id = ?;" + target_code_type = map_file.stem + cur.execute(query, (concept_set_name, omop_metadata['vocabulary_id'], )) + # FIXME: fails here with a None return + logger.debug(f"target code type {target_code_type}") + logger.debug(f"omop code type {omop_vocab_types[target_code_type]}") + concept_set_id = cur.fetchone()[0] + logger.debug(f"concept set id {concept_set_id}") + + # get corresponding Concept_id (OMOP) for each Concept_code (e.g. SNOMED) + concept_codes = "'"+"', '".join(list(grp["CONCEPT"].astype(str)))+"'" + query = f"SELECT concept_id FROM CONCEPT WHERE vocabulary_id = ? AND concept_code IN ({concept_codes});" + cur.execute(query, (omop_vocab_types[target_code_type], )) + df_out = pd.DataFrame(cur.fetchall(), columns=["concept_id"]) + + if not len(grp) == len(df_out): + logger.error(f"ERROR: Some {omop_vocab_types[target_code_type]} Codes do not exist in OMOP Database") + + #Create Concept_set_item + df_out["concept_set_id"] = concept_set_id + df_out.to_sql("CONCEPT_SET_ITEM", conn, if_exists='append', index=False) + + conn.close() + + logger.debug(f"Created export db successfully") + + return export_db_path + + return export_db_path diff --git a/acmc/parse.py b/acmc/parse.py index bbd95f0d7d7ba213c63a7e7de79627546d601eb4..7595ecbafff83c37631dbbe8794809cf8a4bbe5a 100644 --- a/acmc/parse.py +++ b/acmc/parse.py @@ -411,14 +411,3 @@ class CodeTypeParser(): "med": Med(), "cprd": Cprd(), } - -vocab_types = { - "read2": "Read", - "read3": None, - "icd10": "ICD10CM", - "snomed": "SNOMED", - "opcs4": "OPCS4", - "atc": "ATC", - "med": None, - "cprd": None, -} \ No newline at end of file diff --git a/acmc/phen.py b/acmc/phen.py index e1ba3d6f64b162eab8e4a02136afc2079a6690f1..130b81b90137f1d89eddb8cc8364362fde9bd76e 100644 --- a/acmc/phen.py +++ b/acmc/phen.py @@ -27,7 +27,8 @@ DEFAULT_PHEN_PATH = Path('./workspace') / PHEN_DIR CODES_DIR = 'codes' MAP_DIR = 'map' CONCEPT_SET_DIR = 'concept-set' -DEFAULT_PHEN_DIR_LIST = [CODES_DIR, MAP_DIR, CONCEPT_SET_DIR] 
+OMOP_DIR = 'omop' +DEFAULT_PHEN_DIR_LIST = [CODES_DIR, MAP_DIR, CONCEPT_SET_DIR, OMOP_DIR] CONFIG_FILE = 'config.json' DEFAULT_GIT_BRANCH = 'main' @@ -626,6 +627,39 @@ def publish(phen_dir): logger.info(f"Phenotype published successfully") +def export(phen_dir, version): + """Exports the phenotype's map files to an OMOP database in the phenotype's omop directory; `version` is only logged, the exported version is read from config.json""" + logger.info(f"Exporting phenotype {phen_dir} at version {version}") + + # validate configuration + validate(phen_dir) + phen_path = Path(phen_dir) + + # load configuration (read_bytes closes the file; json.loads decodes the bytes as json.load did) + config_path = phen_path / CONFIG_FILE + config = json.loads(config_path.read_bytes()) + + map_path = phen_path / MAP_DIR + if not map_path.exists(): + logger.warning(f"Map path does not exist '{map_path}'") + + export_path = phen_path / OMOP_DIR + # check export directory exists and if not create it + if not export_path.exists(): + export_path.mkdir(parents=True) + logger.debug(f"OMOP export directory '{export_path}' created.") + + # omop export db + export_db_path = omop.export(map_path, + export_path, + config['concept_sets']['version'], + config['concept_sets']['omop']) + + + # write to tables + # export as csv + logger.info(f"Phenotype exported successfully") + def copy(phen_dir, target_dir, version): """Copys a phen repo at a specific tagged version into a target directory""" @@ -650,6 +684,7 @@ def copy(phen_dir, target_dir, version): # If copy directory exists, open the repo logger.debug(f"Copy of repository already exists in {copy_path}. 
Opening the repo...") repo = git.Repo(copy_path) + # Check out the latest commit or specified version if version: # Checkout a specific version (e.g., branch, tag, or commit hash) diff --git a/pyproject.toml b/pyproject.toml index 84e7cf409a6ecfaa860cf0b3269f5fa678272765..aeb94388c9a0e46b0627dc71a99e322d692f6d0b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,3 +63,7 @@ dependencies = [ "hatch", "pytest" ] + +[tool.hatch.envs.default.scripts] +dev = "python -m acmc" +test = "pytest tests" \ No newline at end of file diff --git a/tests/test_acmc.py b/tests/test_acmc.py index c02563f55a430523619cc808e767f3497bc3bf46..b53f97c9fb543acec7481f99856786659f49aade 100644 --- a/tests/test_acmc.py +++ b/tests/test_acmc.py @@ -60,7 +60,6 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog): shutil.copytree(source, destination) else: shutil.copy(source, destination) - shutil.copy( phen_path / 'config1.json', phen_path / 'config.json') monkeypatch.setattr(sys, "argv", ["main.py", "phen", "validate", "-d", str(phen_path.resolve())]) main.main() diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000000000000000000000000000000000000..d8aaa4cf1ccc45f00795f8454ff47580d2d049b1 --- /dev/null +++ b/tox.ini @@ -0,0 +1,18 @@ +[tox] +envlist = py39, py310, py311 +isolated_build = true + +[testenv] +description = Run pytests with hatch +deps = + pytest + hatch +commands = + hatch run test + +[testenv:build] +description = Build package using Hatch +skip_install = true +deps = hatch +commands = + hatch build