diff --git a/README.md b/README.md
index a09ca1148f8b05df3659fb50b853b17ff9641073..35c37969f2439ce38d0ec82797b6326350287516 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,7 @@ Go to your [NHS TRUD Account Management](https://isd.digital.nhs.uk/trud/users/a
 Execute the following script to download, install and process TRUD resources
 
-`python trud.py --key <API_KEY>`.
+`python acmc.py trud install --key <API_KEY>`.
 
-Processed TRUD resources are saved as `.parquet` files in the `build/maps/processed/` directory.
+Processed TRUD resources are saved as `.parquet` files in the `build/trud/processed/` directory.
 
@@ -113,7 +113,7 @@ The vocabularies will not be available immediately, you will be notified by emai
 
 * Install vocabularies using the following command:
 
-`python omop.py --install <PATH_TO_DOWNLOADED_FILES>`
+`python acmc.py omop install -f <Path to extracted OMOP downloads folder>`
 
 ## Defining phenotypes
 
@@ -249,33 +249,100 @@ Need to split column into multiple columns, so only one code type per column.
 
 **<b>Large Code lists</b> with numerous phenotypes (e.g. Ho et al), require lots of JSON to be generated. See the "Ho generate JSON" section in process_codes_WP.ipynb for example code to generate*
 
 ## Usage
+The tool follows a structured command system:
+
+```bash
+python acmc.py <command> <subcommand> [options]
+```
+
+### Available Commands
+- **`trud`** – Manage TRUD components
+- **`omop`** – Manage OMOP codes and database
+- **`map`** – Process mapping configurations
-Script preprocess code lists and to map to given concept/phenotype
+---
-### Execute Command Line
-Execute via shell with customizable parameters:
+## TRUD Command
+### Install TRUD Components
+```bash
+python acmc.py trud install -k <TRUD_API_KEY>
+```
+**Options:**
+- `-k, --api-key` _(required)_ – TRUD API key
+
+---
+## OMOP Commands
+### Install OMOP Codes
 ```bash
-python acmc.py [-h] [-r2] [-r3] [-i] [-s] [-o] [-a] [--no-translate] [--no-verify] [--output] [--error-log] mapping_file
+python acmc.py omop install -f <OMOP_FOLDER_PATH>
 ```
+**Options:**
+- `-f, --omop-folder` _(required)_ – Path to extracted OMOP downloads folder
+
+### Clear OMOP Data
+```bash
+python acmc.py omop clear
+```
+_Removes OMOP data from the database._
+
+### Delete OMOP Database
+```bash
+python acmc.py omop delete
+```
+_Deletes the entire OMOP database._
+
+---
+
+## MAP Commands
+### Process Phenotype Configuration
+```bash
+python acmc.py map process -c <CONFIG_FILE> -s <SOURCE_CODES_DIR> -o <OUTPUT_DIR> -t <TARGET_CODING> [options]
+```
+
+**Required Options:**
+- `-c, --config-file` – Path to the phenotype configuration file (an illustrative sketch is given in the appendix below)
+- `-s, --source-codes-dir` – Root directory of source codes
+- `-o, --output-dir` – Directory for CSV or OMOP database output
+- `-t, --target-coding` – Target coding system _(choices: read2, read3, icd10, snomed, opcs4)_
+
+**Optional Flags:**
+- `-tr, --translate` – Enable code translation (default: disabled)
+- `-v, --verify` – Enable code verification (default: disabled)
+
+**Optional Arguments:**
+- `-l, --error-log` – Filepath to save error log (default: `errors.csv`)
+
+---
+
+## Examples
+### Install TRUD Components
+```bash
+python acmc.py trud install -k my-trud-api-key
+```
+
+### Install OMOP Codes
+```bash
+python acmc.py omop install -f /path/to/omop
+```
+
+### Process Mapping Configuration with Read2 Target Coding
+```bash
+python acmc.py map process -c config.json -s /data/source -o /data/output -t read2 --translate --verify
+```
+
+## License
+MIT License
+
+## Support
+For issues, open a ticket in the repository or contact support@example.com.
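+
+## Appendix: Example Phenotype Configuration (illustrative)
+The exact configuration schema is defined by `map.py`; the sketch below only illustrates the keys that module reads (`concept_sets`, `codes`, `folder`, `files`, `file`, `columns`, `actions`, `concept_set`). File names, column names and values are placeholders rather than a tested configuration; in particular, the `columns` mapping from code type to column name is an assumed form.
+
+```json
+{
+  "concept_sets": {
+    "version": "v1.0",
+    "omop": {
+      "vocabulary_id": "MELDB",
+      "vocabulary_name": "MELDB Concepts",
+      "vocabulary_reference": "https://example.org/meldb"
+    },
+    "concept_set": [
+      { "concept_set_name": "EXAMPLE_CONCEPT" }
+    ]
+  },
+  "codes": [
+    {
+      "description": "Example source code list",
+      "folder": "clinical-codes/example",
+      "files": [
+        {
+          "file": "example_codes.csv",
+          "columns": { "read2": "code" },
+          "concept_set": ["EXAMPLE_CONCEPT"]
+        }
+      ]
+    }
+  ]
+}
+```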
+
+
+
-**Required Arguments:**
- - `mapping_file` Concept/Phenotype Assignment File (json)
- - `--output` Filepath to save output to CSV or OMOP SQLite Database
-
-**Options Arguments:**
- - `-r2`, `--read2-code` Read V2 Codes Column name in Source File
- - `-r3`, `--read3-code` Read V3 Codes Column name in Source File
- - `-i`, `--icd10-code` ICD10 Codes Column name in Source File
- - `-s`, `--snomed-code` SNOMED Codes Column name in Source File
- - `-o`, `--opcs4-code` OPCS4 Codes Column name in Source File
- - `-a`, `--atc-code` ATC Codes Column name in Source File
- - `--no-translate` Do not translate code types
- - `--no-verify` Do not verify codes are correct
- - `--error-log` Filepath to save error log to
-
-> **_EXAMPLE:_** `python main.py PHEN_assign_v3.json -r2 --output output/MELD_concepts_readv2.csv --error-log output/MELD_errors.csv`
 
 ## Contributing
diff --git a/acmc.py b/acmc.py
index 7c36a389b6f9b96d927156e8dde6faa043388271..f7c581735bd87e0fc5df030e35fe4241c6ad03c7 100644
--- a/acmc.py
+++ b/acmc.py
@@ -46,7 +46,13 @@ def map_process(args):
     else:
         args.error_log = 'errors.csv'
 
-    map.process(args.config_file, args.target_coding, args.translate, args.verify, args.error_log, output_path="MELD_concepts_read.csv")
+    map.process(args.config_file,
+                args.source_codes_dir,
+                args.target_coding,
+                args.translate,
+                args.verify,
+                args.error_log,
+                output_path="MELD_concepts_read.csv")
 
     print(f"Phenotype processing completed")
 
@@ -71,7 +77,7 @@ def main():
 
     # omop install
     omop_install_parser = omop_subparsers.add_parser("install", help="Install OMOP codes within database")
-    omop_install_parser.add_argument("-f", "--omop-folder", required=True, help="Path to OMOP downloads folder")
+    omop_install_parser.add_argument("-f", "--omop-folder", required=True, help="Path to extracted OMOP downloads folder")
     omop_install_parser.set_defaults(func=omop_install)
 
     # omop clear
@@ -88,6 +94,7 @@ def main():
     # map process
     map_process_parser = map_subparsers.add_parser("process", help="Process map configuration file")
     map_process_parser.add_argument("-c", "--config-file", required=True, help="Phenotype configuration file")
+    map_process_parser.add_argument("-s", "--source-codes-dir", required=True, help="Source codes root directory")
    map_process_parser.add_argument("-o", "--output-dir", required=True, help="Output directory for CSV or OMOP database")
     map_process_parser.add_argument("-t", "--target-coding", required=True, choices=['read2', 'read3', 'icd10', 'snomed', 'opcs4'], help="Specify the target coding (read2, read3, icd10, snomed, opcs4)")
 
diff --git a/map.py b/map.py
index 828ec4a349d4e663b23efe62175d3120d7c1a973..c468dfaeca4c08c5b8960f1b115f964912ad3a7b 100644
--- a/map.py
+++ b/map.py
@@ -4,6 +4,7 @@ import numpy as np
 import json
 import os
 import sqlite3
+from pathlib import Path
 
 from base import log_invalid_code
 from base import bcolors
@@ -28,17 +29,18 @@ def read_table_file(path, excel_sheet=None):
     """
     Load Code List File
     """
-    if path.endswith(".csv"):
+    if path.suffix == ".csv":
         df = pd.read_csv(path, dtype=str)
-    elif path.endswith(".xlsx"):
+    elif path.suffix == ".xlsx":
         if excel_sheet:
             df = pd.read_excel(path, sheet_name=excel_sheet, dtype=str)
         else:
             df = pd.read_excel(path, dtype=str)
-    elif path.endswith(".dta"):
+    elif path.suffix == ".dta":
         df = pd.read_stata(path, dtype=str)
     else:
-        raise Exception("Unsupported filetype provided for source file")
+        raise Exception(f"Unsupported filetype provided for source file {path.suffix}")
+
     return df
 
@@ -64,7 +66,7 @@ def preprocess(
     meta_columns=[],
     file_path=None,
     verify=True,
-    Translate=True,
+    translate=True,
 ):
     """
     Parses each column individually - Order and length will not be preserved!
@@ -165,177 +167,184 @@ def sql_row_exist(conn, table, column, value):
 
     return exists
 
-def process(config_file, target_code_type, translate=True, verify=True, log_errors_path="errors.csv", output_path="MELD_concepts_read.csv"):
-
-    # Load configuration File
-    if config_file.endswith(".json"):
-        mapping = json.load(open(config_file, "rb"))
-        folders = mapping["codes"]
-        summary_config = mapping["concept_sets"]
-    else:
-        raise Exception(f"Unsupported filetype provided for configuration file: {config_file}")
-
-    out = pd.DataFrame([])  # Create Output File to append to
-
-    # Iterate JSON mapping file (OBJECT FORMAT)
-    for folder in folders:
-        print(bcolors.HEADER, folder["description"], bcolors.ENDC)
-        if "files" in folder:
-            for file in folder["files"]:
-                print("---" * 5, file["file"], "---" * 5)
-                file_path = folder["folder"] + "/" + file["file"]
-
-                # Load Code File
-                if "excel_sheet" in file:
-                    df = read_table_file(
-                        path=file_path, excel_sheet=file["excel_sheet"]
-                    )
-                else:
-                    df = read_table_file(path=file_path)
-
-                # Perform Structural Changes to file before preprocessing
-                # split column with multiple code types
-                if (
-                    "actions" in file
-                    and "split_col" in file["actions"]
-                    and "codes_col" in file["actions"]
-                ):
-                    split_col = file["actions"]["split_col"]
-                    codes_col = file["actions"]["codes_col"]
-                    print(
-                        "Action: Splitting",
-                        split_col,
-                        "column into:",
-                        df[split_col].unique(),
-                    )
-                    codes = df[codes_col]
-                    oh = pd.get_dummies(df[split_col], dtype=bool)  # one hot encode
-                    oh = oh.where((oh != True), codes, axis=0)  # fill in 1s with codes
-                    oh[oh == False] = np.NaN  # replace 0s with None
-                    df = pd.concat([df, oh], axis=1)  # merge in new columns
-
-                # Preprocessing & Validation Checks
-                if "columns" in file:
-                    meta_columns = []  # meta columns to keep with codes
-                    if "actions" in file and "divide_col" in file["actions"]:
-                        meta_columns += [file["actions"]["divide_col"]]
-                    # TODO: enable metacolumns to be outputted - problem with map_file appending
-                    if "metadata" in file["columns"]:
-                        meta_columns += file["columns"]["metadata"]
-                    df = preprocess(
-                        df,
-                        file["columns"],
-                        meta_columns=meta_columns,
-                        file_path=file_path,
-                        target_code_type=target_code_type,
-                        verify=verify,
-                        translate=translate,
-                    )
-                else:
-                    raise Exception("No column format provided")
-
-                # partition table by categorical column
-                if (
-                    "actions" in file
-                    and "divide_col" in file["actions"]
-                    and len(df) > 0
-                ):
-                    divide_col = file["actions"]["divide_col"]
-                    print(
-                        "Action: Dividing Table by",
-                        divide_col,
-                        "column into: ",
-                        df[divide_col].unique(),
-                    )
-                    df = df.groupby(divide_col)
-
-                # Map to MELDB Concept/Phenotype
-                if len(df) == 0:
-                    pass
-                    # out = df
-                elif ("concept_set" in file) and isinstance(
-                    df, pd.core.frame.DataFrame
-                ):
-                    out = map_file(
-                        df,
-                        target_code_type,
-                        out,
-                        concepts=file["concept_set"],
-                        meta_columns=meta_columns,
-                        translate=translate,
-                    )
-                elif ("concept_set_categories" in file) and isinstance(
-                    df, pd.core.groupby.generic.DataFrameGroupBy
-                ):
-                    meta_columns.remove(divide_col)  # delete categorical column
-                    for cat, grp in df:
-                        if (
-                            cat in file["concept_set_categories"].keys()
-                        ):  # check if category is mapped
-                            grp = grp.drop(
-                                columns=[divide_col]
-                            )  # delete categorical column
-                            print("Category:", cat)
-                            out = map_file(
-                                grp,
-                                target_code_type,
-                                out,
-                                concepts=file["concept_set_categories"][cat],
-                                meta_columns=meta_columns,
-                            )
-
-        else:
-            print("Folder is empty")
-
-    # check if out is empty
-    if len(out) <= 0:
-        raise Exception("Output file is empty")
-
-    # Final Processing
-    out = out.reset_index(drop=True)
-    out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
-    out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
-
-    # Add Concept Set Defintions metadata
-    summary_df = pd.DataFrame(summary_config["concept_set"])  # transform to dataframe
-    if "metadata" in summary_df.columns:
-        summary_df = summary_df.join(
-            pd.json_normalize(summary_df["metadata"])
-        )  # metadata to columns
-        summary_df = summary_df.drop(columns=["metadata"])
-    summary_df = summary_df.rename(columns={"concept_set_name": "CONCEPT_SET"})
-    summary_df = summary_df.drop_duplicates()  # remove duplicates
-    out = out.merge(summary_df, how="left", on="CONCEPT_SET")  # merge with output
-
-    # Save Output File
-    print(bcolors.HEADER, "---" * 5, "OUTPUT", "---" * 5, bcolors.ENDC)
-    print(out)
-    if output_path == "atlas":
-
-        vocab_id = summary_config["omop"]["vocabulary_id"]
-        vocab_version = summary_config["version"]
-        vocab_name = summary_config["omop"]["vocabulary_name"]
-        vocab_reference = summary_config["omop"]["vocabulary_reference"]
-
-        # Create New OMOP Vocabulary
-        omop_setup(OMOP_DB_PATH, vocab_id, vocab_version, vocab_name, vocab_reference)
-
-        # Export to DB
-        omop_publish_concept_sets(
-            out,
-            OMOP_DB_PATH,
-            vocab_id,
-            omop_vocab_types[target_code_type],
-            vocab_version,
-        )
-    else:
-        # export as CSV to /output
-        out.to_csv(output_path, index=False)
-        print("saved to", output_path)
-
-    # Save Error File
-    if os.path.exists(log_errors_path):
-        error_df = pd.read_csv(log_errors_path)
-        error_df = error_df.drop_duplicates()  # Remove Duplicates from Error file
-        error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"])
-        error_df.to_csv(log_errors_path, index=False)
+def process(config_file, source_codes_dir, target_code_type, translate=True, verify=True, log_errors_path="errors.csv", output_path="MELD_concepts_read.csv"):
+    config_path = Path(config_file)
+    if not config_path.is_file():
+        raise FileNotFoundError(f"Error: phenotype configuration file '{config_path}' does not exist.")
+
+    codes_path = Path(source_codes_dir)
+    if not codes_path.is_dir():
+        raise FileNotFoundError(f"Error: source codes directory {source_codes_dir} does not exist.")
+
+    # Load configuration File
+    if config_path.suffix == ".json":
+        mapping = json.load(open(config_path, "rb"))
+        folders = mapping["codes"]
+        summary_config = mapping["concept_sets"]
+    else:
+        raise Exception(f"Unsupported filetype for configuration file: {config_file}")
+
+    out = pd.DataFrame([])  # Create Output File to append to
+
+    # Iterate JSON mapping file (OBJECT FORMAT)
+    for folder in folders:
+        print(bcolors.HEADER, folder["description"], bcolors.ENDC)
+        if "files" in folder:
+            for file in folder["files"]:
+                print("---" * 5, file["file"], "---" * 5)
+                file_path = codes_path / folder["folder"] / file["file"]
+
+                # Load Code File
+                if "excel_sheet" in file:
+                    df = read_table_file(
+                        path=file_path, excel_sheet=file["excel_sheet"]
+                    )
+                else:
+                    df = read_table_file(path=file_path)
+
+                # Perform Structural Changes to file before preprocessing
+                # split column with multiple code types
+                if (
+                    "actions" in file
+                    and "split_col" in file["actions"]
+                    and "codes_col" in file["actions"]
+                ):
+                    split_col = file["actions"]["split_col"]
+                    codes_col = file["actions"]["codes_col"]
+                    print(
Splitting", + split_col, + "column into:", + df[split_col].unique(), + ) + codes = df[codes_col] + oh = pd.get_dummies(df[split_col], dtype=bool) # one hot encode + oh = oh.where((oh != True), codes, axis=0) # fill in 1s with codes + oh[oh == False] = np.nan # replace 0s with None + df = pd.concat([df, oh], axis=1) # merge in new columns + + # Preprocessing & Validation Checks + if "columns" in file: + meta_columns = [] # meta columns to keep with codes + if "actions" in file and "divide_col" in file["actions"]: + meta_columns += [file["actions"]["divide_col"]] + # TODO: enable metacolumns to be outputted - problem with map_file appending + if "metadata" in file["columns"]: + meta_columns += file["columns"]["metadata"] + df = preprocess( + df, + file["columns"], + meta_columns=meta_columns, + file_path=file_path, + target_code_type=target_code_type, + verify=verify, + translate=translate, + ) + else: + raise Exception("No column format provided") + + # partition table by categorical column + if ( + "actions" in file + and "divide_col" in file["actions"] + and len(df) > 0 + ): + divide_col = file["actions"]["divide_col"] + print( + "Action: Dividing Table by", + divide_col, + "column into: ", + df[divide_col].unique(), + ) + df = df.groupby(divide_col) + + # Map to MELDB Concept/Phenotype + if len(df) == 0: + pass + # out = df + elif ("concept_set" in file) and isinstance( + df, pd.core.frame.DataFrame + ): + out = map_file( + df, + target_code_type, + out, + concepts=file["concept_set"], + meta_columns=meta_columns, + translate=translate, + ) + elif ("concept_set_categories" in file) and isinstance( + df, pd.core.groupby.generic.DataFrameGroupBy + ): + meta_columns.remove(divide_col) # delete categorical column + for cat, grp in df: + if ( + cat in file["concept_set_categories"].keys() + ): # check if category is mapped + grp = grp.drop( + columns=[divide_col] + ) # delete categorical column + print("Category:", cat) + out = map_file( + grp, + target_code_type, + out, + concepts=file["concept_set_categories"][cat], + meta_columns=meta_columns, + ) + + else: + print("Folder is empty") + + # check if out is empty + if len(out) <= 0: + raise Exception("Output file is empty") + + # Final Processing + out = out.reset_index(drop=True) + out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"]) + out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"]) + + # Add Concept Set Defintions metadata + summary_df = pd.DataFrame(summary_config["concept_set"]) # transform to dataframe + if "metadata" in summary_df.columns: + summary_df = summary_df.join( + pd.json_normalize(summary_df["metadata"]) + ) # metadata to columns + summary_df = summary_df.drop(columns=["metadata"]) + summary_df = summary_df.rename(columns={"concept_set_name": "CONCEPT_SET"}) + summary_df = summary_df.drop_duplicates() # remove duplicates + out = out.merge(summary_df, how="left", on="CONCEPT_SET") # merge with output + + # Save Output File + print(bcolors.HEADER, "---" * 5, "OUTPUT", "---" * 5, bcolors.ENDC) + print(out) + if output_path == "atlas": + + vocab_id = summary_config["omop"]["vocabulary_id"] + vocab_version = summary_config["version"] + vocab_name = summary_config["omop"]["vocabulary_name"] + vocab_reference = summary_config["omop"]["vocabulary_reference"] + + # Create New OMOP Vocabulary + omop_setup(OMOP_DB_PATH, vocab_id, vocab_version, vocab_name, vocab_reference) + + # Export to DB + omop_publish_concept_sets( + out, + OMOP_DB_PATH, + vocab_id, + omop_vocab_types[target_code_type], + vocab_version, + ) + else: + # 
+        # export as CSV to /output
+        out.to_csv(output_path, index=False)
+        print("saved to", output_path)
+
+    # Save Error File
+    if os.path.exists(log_errors_path):
+        error_df = pd.read_csv(log_errors_path)
+        error_df = error_df.drop_duplicates()  # Remove Duplicates from Error file
+        error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"])
+        error_df.to_csv(log_errors_path, index=False)
diff --git a/mjb-conda.yaml b/mjb-conda.yaml
index eb437f4fd3660f1c7766e1b4fe99ad67e0ce9166..5249e4be004ccd134d7be24dc01ea2b028571cf1 100644
--- a/mjb-conda.yaml
+++ b/mjb-conda.yaml
@@ -79,10 +79,12 @@ dependencies:
     - aiosqlite==0.21.0
     - click==8.1.8
     - cramjam==2.9.1
+    - et-xmlfile==2.0.0
     - fastparquet==2024.11.0
     - fsspec==2025.2.0
     - greenlet==3.1.1
     - lxml==5.3.1
+    - openpyxl==3.1.5
     - pyarrow==19.0.0
     - pyomop==4.3.0
     - simpledbf==0.2.6
diff --git a/omop.py b/omop.py
index 37b2b0c4bccef4343ea0fc88d14439f8a2b7514d..160d4c3d919784c47a41df534d6bf65d8bd40d9c 100644
--- a/omop.py
+++ b/omop.py
@@ -43,6 +43,32 @@ def install (db_path, omop_install_folder):
 
     conn.close()
 
+def clear(db_path):
+    omop_db_path = Path(db_path)
+    if not omop_db_path.is_file():
+        raise FileNotFoundError(f"Error: OMOP DB file '{omop_db_path}' does not exist.")
+
+    conn = sqlite3.connect(db_path)
+    cur = conn.cursor()
+
+    cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
+
+    # Fetch and print table names
+    tables = cur.fetchall()
+    print("Tables in database:", [table[0] for table in tables])
+
+    #cur.execute("DROP TABLE CONCEPT_SET;")
+    #cur.execute("DROP TABLE CONCEPT_SET_ITEM;")
+
+    conn.close()
+
+def delete(db_path):
+    omop_db_path = Path(db_path)
+    if not omop_db_path.is_file():
+        raise FileNotFoundError(f"Error: OMOP DB file '{omop_db_path}' does not exist.")
+
+    omop_db_path.unlink()
+
 def table_exists(cursor, table_name):
     # Query to check if the table exists
     cursor.execute(
@@ -152,29 +178,3 @@ def publish_concept_sets(out, db_path, vocab_output, vocab_type, output_version)
         df_out.to_sql("CONCEPT_SET_ITEM", conn, if_exists='append', index=False)
 
     conn.close()
-
-def clear(db_path):
-    omop_db_path = Path(db_path)
-    if not omop_db_path.is_file():
-        raise FileNotFoundError(f"Error: OMOP DB file '{omop_db_path}' does not exist.")
-
-    conn = sqlite3.connect(db_path)
-    cur = conn.cursor()
-
-    cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
-
-    # Fetch and print table names
-    tables = cur.fetchall()
-    print("Tables in database:", [table[0] for table in tables])
-
-    #cur.execute("DROP TABLE CONCEPT_SET;")
-    #cur.execute("DROP TABLE CONCEPT_SET_ITEM;")
-
-    conn.close()
-
-def delete(db_path):
-    omop_db_path = Path(db_path)
-    if not omop_db_path.is_file():
-        raise FileNotFoundError(f"Error: OMOP DB file '{omop_db_path}' does not exist.")
-
-    omop_db_path.unlink()
diff --git a/trud.py b/trud.py
index 230231c0480473a2bb06c258f496270b6a2a946a..0d36c26a7b345bbab226585bbfe4e4080275ff79 100644
--- a/trud.py
+++ b/trud.py
@@ -15,7 +15,7 @@ import simpledbf
 
 # Constants
 FQDN = "isd.digital.nhs.uk"
-MAPS_DIR = Path('./build/maps')
+MAPS_DIR = Path('./build/trud')
 MAPS_DOWNLOADS_DIR = MAPS_DIR / 'downloads'
 MAPS_PROCESSED_DIR = MAPS_DIR / 'processed'