diff --git a/.gitignore b/.gitignore
index fd59bdf42f395ff13502d3333df4d305c3b963ab..17c54d31cde8a6e4bd91b5ef2ed88501ea28edf5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,9 @@ __pycache__
 ~$*
 
 # ACMC phenotype build files
-build/*
+
+vocab/*
+workspace/*
+v[0-9]*.[0-9]*.[0-9]*/
 
 *output*
 *.log
diff --git a/README.md b/README.md
index faaba5d44d6f2b722313eb19e12288ce007d4f08..621a30914ab520a71c120e2871763fba3ef75f25 100644
--- a/README.md
+++ b/README.md
@@ -152,12 +152,13 @@ acmc [OPTIONS] COMMAND [SUBCOMMAND] [ARGUMENTS]
 ```
 
 Where:
-- `[OPTIONS]` are global options that apply to all commands (e.g., `--debug`).
+- `[OPTIONS]` are global options that apply to all commands (e.g., `--debug`, `--version`).
 - `[COMMAND]` is the top-level command (e.g., `trud`, `omop`, `phen`).
 - `[SUBCOMMAND]` refers to the specific operation within the command (e.g., `install`, `validate`).
 
 ### Global Options
 
+- `--version`: Display the acmc tool version number.
 - `--debug`: Enable debug mode for more verbose logging.
 
 ### Commands
@@ -277,21 +278,16 @@ The `phen` command is used phenotype-related operations.
 
 ## License
+
 MIT License
 
 ## Support
 
-For issues, open a ticket in the repository or contact support@example.com.
-## Contributing
+For issues, open an [issue in the repository](https://git.soton.ac.uk/meldb/concepts-processing/-/issues).
 
-### Commit to GitLab
+## Contributing
 
-```
-git add .
-git commit -m "my message ..."
-git tag -a v1.0.0 -m "added features ..."
-git push
-```
+Please contact the corresponding author, Jakub Dylag, at J.J.Dylag@soton.ac.uk.
 
 ## Acknowledgements
@@ -302,43 +298,3 @@ This project was developed in the context of the [MELD-B](https://www.southampto
 
 This work is licensed under a [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0).
 
-
-
-
-## Overview
-
-### Workflow
-
-The high level steps to use the tools are outlined below:
-
-**1. Define concept sets:** A domain expert defines a list of [concept sets](#defining-concept-sets) for each observable characteristic of the phenotype using CSV file format (e.g., `PHEN_concept_sets.csv`).
-
-**2. Define concept code lists for concept sets:** A domain expert defines [code lists](#defining-concept-codes) for each concept set within the phenotype using supported coding list formats and stores them in the `/src` directory.
-
-**3. Define mapping from code lists to concept sets:** A domain expert defines a [phenotype mapping](#mapping-codes-to-concept-sets) that maps code lists to concept sets.
-
-**4. Generate versioned phenotype coding lists and translations:** A domain expert or data engineer processes the phenotype mappings [using the command line tool](#usage) to validate against NHS TRUD-registered codes and mapping and to generate versioned concept set code lists with translations between coding standards.
-
-### Supported Medical Coding Standards
-
-The tool supports verification and mapping across diagnostic coding formats below:
-
-| Medical Code  | Verification | Translation to                    |
-|---------------|--------------|-----------------------------------|
-| Readv2        | NHS TRUD     | Readv3, SNOMED, ICD10, OPCS4, ATC |
-| Readv3 (CTV3) | NHS TRUD     | Readv3, SNOMED, ICD10, OPCS4      |
-| ICD10         | NHS TRUD     | None                              |
-| SNOMED        | NHS TRUD     | None                              |
-| OPCS4         | NHS TRUD     | None                              |
-| ATC           | None         | None                              |
-
-- [**Read V2:**](https://digital.nhs.uk/services/terminology-and-classifications/read-codes) NHS clinical terminology standard used in primary care and replaced by SNOMED-CT in 2018; Still supported by some data providers as widely used in primary care, e.g. [SAIL Databank](https://saildatabank.com/)
-- [**SNOMED-CT:**](https://icd.who.int/browse10/2019/en) international standard for clinical terminology for Electronic Healthcare Records adopted by the NHS in 2018; Mappings to Read codes are partially provided by [Clinical Research Practice Database (CPRD)](https://www.cprd.com/) and [NHS Technology Reference Update Distribution (TRUD)](https://isd.digital.nhs.uk/trud).
-- [**ICD-10:**](https://icd.who.int/browse10/2019/en) International Classification of Diseases (ICD) is a medical classification list from the World Health Organization (WHO) and widely used in hospital settings, e.g. Hospital Episode Statistics (HES).
-- [**ATC Codes:**](https://www.who.int/tools/atc-ddd-toolkit/atc-classification) Anatomical Therapeutic Chemical (ATC) Classification is a drug classification list from the World Health Organization (WHO)
-
-## Notes
-
- Processed resources will be saved in the `build/maps/processed/` directory.
-
-*Note: NHS TRUD provides one-way mappings. To reverse mappings, duplicate the `.parquet` file and reverse the filename (e.g., `read2_code_to_snomed_code.parquet` to `snomed_code_to_read2_code.parquet`).*
\ No newline at end of file
diff --git a/acmc/main.py b/acmc/main.py
index db75ae20729d30576c25823ffae839e3b9935d73..6048c451d83d2921b95d6732bebe848713bc03b7 100644
--- a/acmc/main.py
+++ b/acmc/main.py
@@ -8,7 +8,7 @@ from acmc import trud, omop, phen, logging_config as lc
 # setup logging
 logger = lc.setup_logger()
 
-BUILD_PATH = Path('build')
+DEFAULT_WORKING_PATH = Path('./workspace')
 
 def trud_install(args):
     """Handle the `trud install` command."""
@@ -20,11 +20,11 @@ def omop_install(args):
     """Handle the `omop install` command."""
 
 def omop_clear(args):
     """Handle the `omop clear` command."""
-    omop.clear(omop.OMOP_DB_PATH)
+    omop.clear(omop.DB_PATH)
 
 def omop_delete(args):
     """Handle the `omop delete` command."""
-    omop.delete(omop.OMOP_DB_PATH)
+    omop.delete(omop.DB_PATH)
 
 def phen_init(args):
     """Handle the `phen init` command."""
@@ -82,7 +82,7 @@ def main():
 
     # omop install
     omop_install_parser = omop_subparsers.add_parser("install", help="Install OMOP codes within database")
-    omop_install_parser.add_argument("-d", "--omop-dir", type=str, default=str(omop.OMOP_DB_DIR.resolve()), help="Directory path to extracted OMOP downloads")
+    omop_install_parser.add_argument("-d", "--omop-dir", type=str, default=str(omop.VOCAB_PATH.resolve()), help="Directory path to extracted OMOP downloads")
     omop_install_parser.add_argument("-v", "--version", required=True, help="OMOP vocabularies release version")
     omop_install_parser.set_defaults(func=omop_install)
 
@@ -140,15 +140,34 @@ def main():
 
     # phen copy
     phen_copy_parser = phen_subparsers.add_parser("copy", help="Publish phenotype configuration")
-    phen_copy_parser.add_argument("-d", "--phen-dir", type=str, default=str(phen.DEFAULT_PHEN_PATH.resolve()), help="Phenotype directory")
-    phen_copy_parser.add_argument("-td", "--target-dir", type=str, default=str(BUILD_PATH.resolve()), help="Target directory for the copy")
-    phen_copy_parser.add_argument("-v", "--version", type=str, default=None, help="Phenotype version to copy")
+    phen_copy_parser.add_argument("-d",
+                                  "--phen-dir",
+                                  type=str,
+                                  default=str(phen.DEFAULT_PHEN_PATH.resolve()),
+                                  help="Phenotype directory")
+    phen_copy_parser.add_argument("-td",
+                                  "--target-dir",
+                                  type=str,
+                                  default=str(DEFAULT_WORKING_PATH.resolve()),
+                                  help="Target directory for the copy")
+    phen_copy_parser.add_argument("-v",
+                                  "--version",
+                                  type=str,
+                                  default='latest',
+                                  help="Phenotype version to copy, defaults to the latest version")
     phen_copy_parser.set_defaults(func=phen_copy)
 
     # phen diff
     phen_diff_parser = phen_subparsers.add_parser("diff", help="Publish phenotype configuration")
-    phen_diff_parser.add_argument("-d", "--phen-dir", type=str, default=str(phen.DEFAULT_PHEN_PATH.resolve()), help="The directory for the new phenotype version")
-    phen_diff_parser.add_argument("-old", "--phen-dir-old", type=str, default=str(phen.DEFAULT_PHEN_PATH.resolve()), help="The directory of the old phenotype version that is compared to the new one")
+    phen_diff_parser.add_argument("-d",
+                                  "--phen-dir",
+                                  type=str,
+                                  default=str(phen.DEFAULT_PHEN_PATH.resolve()),
+                                  help="The directory for the new phenotype version")
+    phen_diff_parser.add_argument("-old",
+                                  "--phen-dir-old",
+                                  required=True,
+                                  help="The directory of the old phenotype version that is compared to the new one")
     phen_diff_parser.set_defaults(func=phen_diff)
 
     # Parse arguments
diff --git a/acmc/omop.py b/acmc/omop.py
index ca447cf3cb9796464f249534ed9e10c4af17b8d7..e5d828a9dd0d8af63ba822b2e28c16cd67abd5e4 100644
--- a/acmc/omop.py
+++ b/acmc/omop.py
@@ -11,10 +11,11 @@ from acmc import logging_config
 # setup logging
 logger = logging_config.setup_logger()
 
-OMOP_DB_DIR = Path('./build/omop')
-OMOP_DB_PATH = OMOP_DB_DIR / 'omop_54.sqlite'
+# constants
+VOCAB_PATH = Path('./vocab/omop')
+DB_PATH = VOCAB_PATH / 'omop_54.sqlite'
 VERSION_FILE = 'omop_version.json'
-VERSION_PATH = OMOP_DB_DIR / VERSION_FILE
+VERSION_PATH = VOCAB_PATH / VERSION_FILE
 
 vocabularies = {
     "source": "OHDSI Athena",
@@ -37,7 +38,7 @@ vocabularies = {
 }
 
 #Populate SQLite3 Database with default OMOP CONCEPTS
-def install (omop_install_folder, version, db_path=OMOP_DB_PATH):
+def install(omop_install_folder, version):
     """Installs the OMOP release csv files in a file-based sql database"""
 
     logger.info(f"Installing OMOP database from {omop_install_folder}")
@@ -45,14 +46,13 @@
     omop_install_path = Path(omop_install_folder)
     if not omop_install_path.is_dir():
         raise NotADirectoryError(f"Error: '{omop_install_path}' for OMOP installation files is not a directory")
-    # check codes directory exists and if not create it
-    if not OMOP_DB_DIR.exists():
-        OMOP_DB_DIR.mkdir(parents=True)
-        logger.debug(f"OMOP directory '{OMOP_DB_DIR}' created.")
+    if not VOCAB_PATH.exists():
+        VOCAB_PATH.mkdir(parents=True)
+        logger.debug(f"OMOP directory '{VOCAB_PATH}' created.")
 
     # connect to database, if it does not exist it will be created
-    conn = sqlite3.connect(OMOP_DB_PATH)
+    conn = sqlite3.connect(DB_PATH)
 
     # Iterate through files in the folder
     for filename in os.listdir(omop_install_folder):
         if filename.endswith(".csv"):  # Check if the file is a CSV
diff --git a/acmc/parse.py b/acmc/parse.py
index 0a9fd578575f5ae6d1318bddbec0597f330c3959..bbd95f0d7d7ba213c63a7e7de79627546d601eb4 100644
--- a/acmc/parse.py
+++ b/acmc/parse.py
@@ -88,7 +88,7 @@ class Proto():
 class Read2(Proto):
     """ This Read2 class extends Proto, adding custom validation checks for a dataset of "Read2" codes. It ensures that the dataset is loaded, validates the codes based on several rules, and applies corrections or logs errors when necessary."""
     def __init__(self):
-        super().__init__('read2', trud.TRUD_PROCESSED_DIR / 'read2.parquet')
+        super().__init__('read2', trud.PROCESSED_PATH / 'read2.parquet')
 
         # validate checks
         self.checks = [
@@ -141,7 +141,7 @@ class Read2(Proto):
 
 class Read3(Proto):
     def __init__(self):
-        super().__init__('Read3', trud.TRUD_PROCESSED_DIR / 'read3.parquet')
+        super().__init__('Read3', trud.PROCESSED_PATH / 'read3.parquet')
 
         self.checks = [
             (
@@ -188,7 +188,7 @@ class Read3(Proto):
 
 class Icd10(Proto):
     def __init__(self):
-        super().__init__('icd10', trud.TRUD_PROCESSED_DIR / 'icd10.parquet')
+        super().__init__('icd10', trud.PROCESSED_PATH / 'icd10.parquet')
 
         self.checks = [
             (
@@ -254,7 +254,7 @@ class Icd10(Proto):
 
 class Snomed(Proto):
     def __init__(self):
-        super().__init__('snomed', trud.TRUD_PROCESSED_DIR / 'snomed.parquet')
+        super().__init__('snomed', trud.PROCESSED_PATH / 'snomed.parquet')
 
         self.checks = [
             # (
@@ -311,7 +311,7 @@ class Snomed(Proto):
 
 class Opcs4(Proto):
     def __init__(self):
-        super().__init__('opcs4', trud.TRUD_PROCESSED_DIR / 'opcs4.parquet')
+        super().__init__('opcs4', trud.PROCESSED_PATH / 'opcs4.parquet')
 
         self.checks = [
             (
@@ -396,7 +396,7 @@ class Cprd(Proto):
 
 class CodeTypeParser():
     """A class used in InvalidCodesException to report an error if a code parser check fails"""
-    def __init__(self, trud_processed_dir=trud.TRUD_PROCESSED_DIR):
+    def __init__(self, trud_processed_dir=trud.PROCESSED_PATH):
 
         if not trud_processed_dir.exists() or not trud_processed_dir.is_dir():
             raise FileNotFoundError(f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install")
diff --git a/acmc/phen.py b/acmc/phen.py
index 1b04b37e1f3368e39315e4c77f68cc5dad3c9fc7..f36d64c42946ffa89943b559daceb1a1c8714fcf 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -22,15 +22,13 @@ logger = lc.setup_logger()
 
 pd.set_option("mode.chained_assignment", None)
 
 PHEN_DIR = 'phen'
-DEFAULT_PHEN_PATH = Path('build') / PHEN_DIR
+DEFAULT_PHEN_PATH = Path('./workspace') / PHEN_DIR
 
 CODES_DIR = 'codes'
 MAP_DIR = 'map'
 CONCEPT_SET_DIR = 'concept-set'
 DEFAULT_PHEN_DIR_LIST = [CODES_DIR, MAP_DIR, CONCEPT_SET_DIR]
-
 CONFIG_FILE = 'config.json'
-REPORT_FILE = 'report.md'
 
 DEFAULT_GIT_BRANCH = 'main'
@@ -377,7 +375,7 @@ def translate_codes(df, target_code_type):
             codes = pd.concat([codes, df[target_code_type]])
         else:
             filename = f"{col_name}_to_{target_code_type}.parquet"
-            map_path = trud.TRUD_PROCESSED_DIR / filename
+            map_path = trud.PROCESSED_PATH / filename
             if map_path.exists():
                 col = df[col_name]
                 df_map = pd.read_parquet(map_path)
@@ -628,7 +626,7 @@ def publish(phen_dir):
 
     logger.info(f"Phenotype published successfully")
 
-def copy(phen_dir, target_dir, version=None):
+def copy(phen_dir, target_dir, version):
     """Copys a phen repo at a specific tagged version into a target directory"""
 
     # Validate
@@ -641,11 +639,7 @@
         raise FileNotFoundError(f"The target directory {target_path} does not exist")
 
     # Set copy directory
-    if version:
-        copy_path = target_path / version
-    else:
-        copy_path = target_path / 'latest'
-
+    copy_path = target_path / version
     logger.info(f"Copying repo {phen_path} to {copy_path}")
 
     if not copy_path.exists():
@@ -681,12 +675,10 @@ def diff(phen_dir, phen_old_dir):
     new_phen_path = Path(phen_dir)
 
     # Load report (FOR SOME REASON THIS WAS APPEND SO SET TO w for NOW)
-    report_path = new_phen_path / REPORT_FILE
-    if report_path.suffix == ".md":
-        report = open(report_path, 'w')
-        logger.debug(f"Writing to report file {str(report_path.resolve())}")
-    else:
-        raise ValueError(f"Unsupported filetype provided for report file {str(report_path.resolve())}")
+    report_file_name = old_phen_path.name + "_diff.md"
+    report_path = new_phen_path / report_file_name
+    report = open(report_path, 'w')
+    logger.debug(f"Writing to report file {str(report_path.resolve())}")
 
     # Get maps files from phenotype
     old_map_path = old_phen_path / MAP_DIR
diff --git a/acmc/trud.py b/acmc/trud.py
index 3de3d84699f59a16819c2c749f56bb450b55912a..5338a58ae071e58616c80df7eb8706744b38d9c2 100644
--- a/acmc/trud.py
+++ b/acmc/trud.py
@@ -16,11 +16,11 @@ logger = lc.setup_logger()
 
 # Constants
 FQDN = "isd.digital.nhs.uk"
-TRUD_PATH = Path('./build/trud')
+VOCAB_PATH = Path('./vocab/trud')
 VERSION_FILE = 'trud_version.json'
-VERSION_PATH = TRUD_PATH / VERSION_FILE
-TRUD_DOWNLOADS_DIR = TRUD_PATH / 'downloads'
-TRUD_PROCESSED_DIR = TRUD_PATH / 'processed'
+VERSION_PATH = VOCAB_PATH / VERSION_FILE
+DOWNLOADS_PATH = VOCAB_PATH / 'downloads'
+PROCESSED_PATH = VOCAB_PATH / 'processed'
 
 def error_exit(message):
     logger.error(message, "error")
@@ -42,17 +42,17 @@ def get_releases(item_id, API_KEY, latest=False):
 
     return data.get("releases", [])
 
-def download_release_file(item_id, release_ordinal, release, file_json_prefix, file_type=None, items_folder=TRUD_DOWNLOADS_DIR):
+def download_release_file(item_id, release_ordinal, release, file_json_prefix, file_type=None):
     """Download specified file type for a given release of an item."""
 
     # check folder is a directory
-    if not items_folder.is_dir():
-        raise NotADirectoryError(f"Error: '{items_folder}' for OMOP installation files is not a directory")
+    if not DOWNLOADS_PATH.is_dir():
+        raise NotADirectoryError(f"Error: '{DOWNLOADS_PATH}' for TRUD resources is not a directory")
 
     file_type = file_type or file_json_prefix
     file_url = release.get(f"{file_json_prefix}FileUrl")
     file_name = release.get(f"{file_json_prefix}FileName")
-    file_destination = TRUD_DOWNLOADS_DIR / file_name
+    file_destination = DOWNLOADS_PATH / file_name
 
     if not file_url or not file_name:
         error_exit(f"Missing {file_type} file information for release {release_ordinal} of item {item_id}.")
@@ -76,35 +76,35 @@ def validate_download_hash(file_destination:str, item_hash:str):
     else:
         error_exit(f"Could not validate origin of {file_destination}. The SHA-256 hash should be: {item_hash}, but got {hash} instead")
 
-def unzip_download(file_destination:str, items_folder=TRUD_DOWNLOADS_DIR):
+def unzip_download(file_destination:str):
     # check folder is a directory
-    if not items_folder.is_dir():
-        raise NotADirectoryError(f"Error: '{items_folder}' for OMOP installation files is not a directory")
+    if not DOWNLOADS_PATH.is_dir():
+        raise NotADirectoryError(f"Error: '{DOWNLOADS_PATH}' for TRUD resources is not a directory")
 
     with zipfile.ZipFile(file_destination, 'r') as zip_ref:
-        zip_ref.extractall(items_folder)
+        zip_ref.extractall(DOWNLOADS_PATH)
 
 def extract_icd10():
     #ICD10_edition5
-    file_path = TRUD_DOWNLOADS_DIR / 'ICD10_Edition5_XML_20160401' / 'Content' / 'ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml'
+    file_path = DOWNLOADS_PATH / 'ICD10_Edition5_XML_20160401' / 'Content' / 'ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml'
     df = pd.read_xml(file_path)
     df = df[["CODE", "ALT_CODE", "DESCRIPTION"]]
     df = df.rename(columns={"CODE":"icd10", "ALT_CODE":"icd10_alt", "DESCRIPTION":"description" })
-    output_path = TRUD_PROCESSED_DIR / 'icd10.parquet'
+    output_path = PROCESSED_PATH / 'icd10.parquet'
     df.to_parquet(output_path, index=False)
     logger.info(f"Extracted: {output_path}")
 
 def extract_opsc4():
-    file_path = TRUD_DOWNLOADS_DIR / 'OPCS410 Data files txt' / 'OPCS410 CodesAndTitles Nov 2022 V1.0.txt'
+    file_path = DOWNLOADS_PATH / 'OPCS410 Data files txt' / 'OPCS410 CodesAndTitles Nov 2022 V1.0.txt'
 
     df = pd.read_csv(file_path, sep='\t', dtype=str, header=None)
     df = df.rename(columns={0:"opcs4", 1:"description"})
 
-    output_path = TRUD_PROCESSED_DIR / 'opcs4.parquet'
+    output_path = PROCESSED_PATH / 'opcs4.parquet'
     df.to_parquet(output_path, index=False)
     logger.info(f"Extracted: {output_path}")
 
@@ -112,30 +112,30 @@ def extract_nhs_data_migrations():
     #NHS Data Migrations
 
     #snomed only
-    file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'sctcremap_uk_20200401000001.txt'
+    file_path = DOWNLOADS_PATH / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'sctcremap_uk_20200401000001.txt'
     df = pd.read_csv(file_path, sep='\t')
     df = df[["SCT_CONCEPTID"]]
     df = df.rename(columns={"SCT_CONCEPTID":"snomed"})
     df = df.drop_duplicates()
     df = df.astype(str)
 
-    output_path = TRUD_PROCESSED_DIR / 'snomed.parquet'
+    output_path = PROCESSED_PATH / 'snomed.parquet'
     df.to_parquet(output_path, index=False)
     logger.info(f"Extracted: {output_path}")
 
     #r2 -> r3
-    file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rctctv3map_uk_20200401000001.txt'
+    file_path = DOWNLOADS_PATH / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rctctv3map_uk_20200401000001.txt'
     df = pd.read_csv(file_path, sep='\t')
     df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]]
     df = df.rename(columns={"V2_CONCEPTID":"read2", "CTV3_CONCEPTID":"read3"})
 
-    output_path = TRUD_PROCESSED_DIR / 'read2_to_read3.parquet'
+    output_path = PROCESSED_PATH / 'read2_to_read3.parquet'
     df.to_parquet(output_path, index=False)
     logger.info(f"Extracted: {output_path}")
 
     #r3->r2
-    file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3rctmap_uk_20200401000002.txt'
+    file_path = DOWNLOADS_PATH / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3rctmap_uk_20200401000002.txt'
     df = pd.read_csv(file_path, sep='\t')
     df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]]
     df = df.rename(columns={"CTV3_CONCEPTID":"read3",
@@ -143,23 +143,23 @@ def extract_nhs_data_migrations():
     df = df.drop_duplicates()
     df = df[~df["read2"].str.match("^.*_.*$")] #remove r2 codes with '_'
 
df[~df["read2"].str.match("^.*_.*$")] #remove r2 codes with '_' - output_path = TRUD_PROCESSED_DIR / 'read3_to_read2.parquet' + output_path = PROCESSED_PATH / 'read3_to_read2.parquet' df.to_parquet(output_path, index=False) logger.info(f"Extracted: {output_path}") #r2 -> snomed - file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rcsctmap2_uk_20200401000001.txt' + file_path = DOWNLOADS_PATH / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rcsctmap2_uk_20200401000001.txt' df = pd.read_csv(file_path, sep='\t', dtype=str) df = df[["ReadCode", "ConceptId"]] df = df.rename(columns={"ReadCode":"read2", "ConceptId":"snomed"}) - output_path = TRUD_PROCESSED_DIR / 'read2_to_snomed.parquet' + output_path = PROCESSED_PATH / 'read2_to_snomed.parquet' df.to_parquet(output_path, index=False) logger.info(f"Extracted: {output_path}") #r3->snomed - file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3sctmap2_uk_20200401000001.txt' + file_path = DOWNLOADS_PATH / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3sctmap2_uk_20200401000001.txt' df = pd.read_csv(file_path, sep='\t', dtype=str) df = df[["CTV3_TERMID", "SCT_CONCEPTID"]] df = df.rename(columns={"CTV3_TERMID":"read3", @@ -167,70 +167,70 @@ def extract_nhs_data_migrations(): df["snomed"] = df["snomed"].astype(str) df = df[~df["snomed"].str.match("^.*_.*$")] #remove snomed codes with '_' - output_path = TRUD_PROCESSED_DIR / 'read3_to_snomed.parquet' + output_path = PROCESSED_PATH / 'read3_to_snomed.parquet' df.to_parquet(output_path, index=False) logger.info(f"Extracted: {output_path}") def extract_nhs_read_browser(): #r2 only - input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ANCESTOR.DBF' + input_path = DOWNLOADS_PATH / 'Standard' / 'V2' / 'ANCESTOR.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = pd.concat([df['READCODE'], df['DESCENDANT']]) df = pd.DataFrame(df.drop_duplicates()) df = df.rename(columns={0:"read2"}) - output_path = TRUD_PROCESSED_DIR / 'read2.parquet' + output_path = PROCESSED_PATH / 'read2.parquet' df.to_parquet(output_path, index=False) logger.info(f"Extracted: {output_path}") #r2 -> atc - input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ATC.DBF' + input_path = DOWNLOADS_PATH / 'Standard' / 'V2' / 'ATC.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = df[["READCODE", "ATC"]] df = df.rename(columns={"READCODE":"read2", "ATC":"atc"}) - output_path = TRUD_PROCESSED_DIR / 'read2_to_atc.parquet' + output_path = PROCESSED_PATH / 'read2_to_atc.parquet' df.to_parquet(output_path, index=False) logger.info(f"Extracted: {output_path}") #r2 -> icd10 - input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ICD10.DBF' + input_path = DOWNLOADS_PATH / 'Standard' / 'V2' / 'ICD10.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = df[["READ_CODE", "TARG_CODE"]] df = df.rename(columns={"READ_CODE":"read2", "TARG_CODE":"icd10"}) df = df[~df["icd10"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read2"].str.match("^.*-.*$")] #remove codes with '-' - output_path = TRUD_PROCESSED_DIR / 'read2_to_icd10.parquet' + output_path = PROCESSED_PATH / 'read2_to_icd10.parquet' df.to_parquet(output_path, index=False) logger.info(f"Extracted: {output_path}") #r2 -> opcs4 - input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'OPCS4V3.DBF' + input_path = DOWNLOADS_PATH / 'Standard' / 'V2' / 'OPCS4V3.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = df[["READ_CODE", "TARG_CODE"]] df = 
df.rename(columns={"READ_CODE":"read2", "TARG_CODE":"opcs4"}) df = df[~df["opcs4"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read2"].str.match("^.*-.*$")] #remove codes with '-' - output_path = TRUD_PROCESSED_DIR / 'read2_to_opcs4.parquet' + output_path = PROCESSED_PATH / 'read2_to_opcs4.parquet' df.to_parquet(output_path, index=False) logger.info(f"Extracted: {output_path}") #r3 only - input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ANCESTOR.DBF' + input_path = DOWNLOADS_PATH / 'Standard' / 'V3' / 'ANCESTOR.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = pd.concat([df['READCODE'], df['DESCENDANT']]) df = pd.DataFrame(df.drop_duplicates()) df = df.rename(columns={0:"read3"}) - output_path = TRUD_PROCESSED_DIR / 'read3.parquet' + output_path = PROCESSED_PATH / 'read3.parquet' df.to_parquet(output_path, index=False) logger.info(f"Extracted: {output_path}") #r3 -> icd10 - input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ICD10.DBF' + input_path = DOWNLOADS_PATH / 'Standard' / 'V3' / 'ICD10.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = df[["READ_CODE", "TARG_CODE"]] df = df.rename(columns={"READ_CODE":"read3", "TARG_CODE":"icd10"}) df = df[~df["icd10"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read3"].str.match("^.*-.*$")] #remove codes with '-' - output_path = TRUD_PROCESSED_DIR / 'read3_to_icd10.parquet' + output_path = PROCESSED_PATH / 'read3_to_icd10.parquet' df.to_parquet(output_path, index=False) logger.info(f"Extracted: {output_path}") @@ -238,13 +238,13 @@ def extract_nhs_read_browser(): # dbf = simpledbf.Dbf5('build/maps/downloads/Standard/V3/ICD9V3.DBF') #r3 -> opcs4 - input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'OPCS4V3.DBF' + input_path = DOWNLOADS_PATH / 'Standard' / 'V3' / 'OPCS4V3.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = df[["READ_CODE", "TARG_CODE"]] df = df.rename(columns={"READ_CODE":"read3", "TARG_CODE":"opcs4"}) df = df[~df["opcs4"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read3"].str.match("^.*-.*$")] #remove codes with '-' - output_path = TRUD_PROCESSED_DIR / 'read3_to_opcs4.parquet' + output_path = PROCESSED_PATH / 'read3_to_opcs4.parquet' df.to_parquet(output_path, index=False) logger.info(f"Extracted: {output_path}") @@ -253,11 +253,11 @@ def create_map_directories(): # Check if build directory exists create_map_dirs = False - if TRUD_PATH.exists(): - user_input = input(f"The map directory {TRUD_PATH} already exists. Do you want to download and process trud data again? (y/n): ").strip().lower() + if VOCAB_PATH.exists(): + user_input = input(f"The map directory {VOCAB_PATH} already exists. Do you want to download and process trud data again? (y/n): ").strip().lower() if user_input == "y": # delete all build files - shutil.rmtree(TRUD_PATH) + shutil.rmtree(VOCAB_PATH) create_map_dirs = True elif user_input == "n": logger.info("Exiting TRUD installation") @@ -267,9 +267,9 @@ def create_map_directories(): if create_map_dirs: # create maps directories - TRUD_PATH.mkdir(parents=True, exist_ok=True) - TRUD_DOWNLOADS_DIR.mkdir(parents=True, exist_ok=True) - TRUD_PROCESSED_DIR.mkdir(parents=True,exist_ok=True) + VOCAB_PATH.mkdir(parents=True, exist_ok=True) + DOWNLOADS_PATH.mkdir(parents=True, exist_ok=True) + PROCESSED_PATH.mkdir(parents=True,exist_ok=True) def install(): logger.info(f"Installing TRUD")
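For reference, a minimal sketch of the CLI flow against the renamed layout, using only commands and flags defined in the patch above; the OMOP release value `5.4` and the `workspace/latest` target are illustrative assumptions rather than values the patch mandates:

```
# Vocabularies now install under ./vocab (previously ./build)
acmc trud install                        # populates vocab/trud/downloads and vocab/trud/processed
acmc omop install -d vocab/omop -v 5.4   # -v is required; "5.4" is an example release

# Phenotype outputs now live under ./workspace (previously ./build)
acmc phen copy -d workspace/phen -td workspace           # -v now defaults to 'latest'
acmc phen diff -d workspace/phen -old workspace/latest   # -old is now required
```

Because `.gitignore` now lists `vocab/*`, `workspace/*`, and `v[0-9]*.[0-9]*.[0-9]*/`, none of these outputs should appear as untracked files.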