From 7389291c15ec5970f9887fbb9468f63221ec974c Mon Sep 17 00:00:00 2001
From: Jakub Dylag <jjd1c23@soton.ac.uk>
Date: Fri, 1 Nov 2024 10:52:09 +0000
Subject: [PATCH] Auto Download and Extract TRUD Code Tables

---
 README.md   |  27 ++--
 trud_api.py | 298 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 315 insertions(+), 10 deletions(-)
 create mode 100644 trud_api.py

diff --git a/README.md b/README.md
index fce8669..f9efe2b 100644
--- a/README.md
+++ b/README.md
@@ -42,8 +42,6 @@ The output code list is then used by data providers to select MELD-B cohorts.
 | SNOMED | NHS TRUD | |
 | OPCS4 | NHS TRUD | |
 | ATC | None | |
-| MED | None | |
-| CPRD Product | None | |
 
 MELD-B refers to various diagnostic code formats included in target datasets.
 * Read V2
@@ -57,19 +55,28 @@ MELD-B refers to various diagnostic code formats included in target datasets.
 
 ## ⚙️ Setup
 
-- Delete corrupted files that cannot be read with `bash import.sh`
-
 ### Code Translation Tables
-1. Due to the licencing of NHS TRUD coding tables, the following resources <mark>must be downloaded separately</mark>:
+1. Due to the licensing of NHS TRUD resources, you <mark>MUST first [Sign Up](https://isd.digital.nhs.uk/trud/user/guest/filters/0/account/form) to NHS TRUD and accept the following licences</mark>:
    - [nhs_readbrowser_25.0.0_20180401000001](https://isd.digital.nhs.uk/trud/users/guest/filters/2/categories/9/items/8/releases)
    - [nhs_datamigration_29.0.0_20200401000001](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/9/items/9/releases)
-   - [ICD10_Edition5_XML_20160401](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/28/items/258/releases?source=summary)
+   - [ICD10_Edition5_XML_20160401](https://isd.digital.nhs.uk/trud/users/authenticated/filters/0/categories/28/items/259/releases)
    - [OPCS-4.10 Data files](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/10/items/119/releases)
-   - [BNF/Snomed Mapping data.xlsx](https://www.nhsbsa.nhs.uk/prescription-data/understanding-our-data/bnf-snomed-mapping)
+   <!-- - [BNF/Snomed Mapping data.xlsx](https://www.nhsbsa.nhs.uk/prescription-data/understanding-our-data/bnf-snomed-mapping) -->
+
+2. Once all licences are accepted, get your [API Key](https://isd.digital.nhs.uk/trud/users/authenticated/filters/0/account/manage) for NHS TRUD.
 
-2. Next, prepare the convertion Tables by saving them as `.parquet` tables.
-   - See "Mappings" section in process_codes_WP.ipynb to generate table with appropriate name
-   - For reversible convertions create a duplicate table with the name reversed. However be aware this is <b>NOT ADVISED</b> and goes against NHS guidance.
+3. Finally, run the automated extraction script, providing your API Key to grant temporary access to the resources above: `python trud_api.py --key <INSERT KEY>` (replace `<INSERT KEY>` with your key).
+   - The conversion tables will be saved as `.parquet` files in the folder `maps/processed/`.
+   - NHS TRUD defines one-way mappings and does <b>NOT ADVISE</b> reversing them. If you still wish to reverse these into two-way mappings, duplicate the given `.parquet` table and reverse the filename (e.g. `read2_code_to_snomed_code.parquet` to `snomed_code_to_read2_code.parquet`), as sketched below.
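+   - A minimal sketch of such a reversal with pandas (illustrative only; `trud_api.py` itself never reverses mappings):
+     ```python
+     import pandas as pd
+
+     df = pd.read_parquet("maps/processed/read2_code_to_snomed_code.parquet")
+     # Swap the column order and save under the reversed filename
+     df[["snomed_code", "read2_code"]].to_parquet(
+         "maps/processed/snomed_code_to_read2_code.parquet", index=False)
+     ```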
 
 ### JSON phenotype mapping
 
diff --git a/trud_api.py b/trud_api.py
new file mode 100644
index 0000000..a7eade3
--- /dev/null
+++ b/trud_api.py
@@ -0,0 +1,298 @@
+import os
+import sys
+import requests
+import argparse
+from pathlib import Path
+
+from base import bcolors
+
+import hashlib
+import zipfile
+import pandas as pd
+import simpledbf
+
+# Constants
+FQDN = "isd.digital.nhs.uk"
+
+def error_exit(message):
+    print(message, file=sys.stderr)
+    sys.exit(1)
+
+def validate_api_key(api_key):
+    """Validate that the API key is a 40-character hexadecimal string."""
+    if not api_key or len(api_key) != 40 or not all(c in "0123456789abcdef" for c in api_key.lower()):
+        error_exit("Invalid API key format. Expected a 40-character hexadecimal string.")
+
+def get_releases(item_id, API_KEY, latest=False):
+    """Retrieve release information for an item from the TRUD API."""
+    url = f"https://{FQDN}/trud/api/v1/keys/{API_KEY}/items/{item_id}/releases"
+    if latest:
+        url += "?latest"
+    response = requests.get(url)
+    if response.status_code != 200:
+        error_exit(f"Failed to fetch releases for item {item_id}. Status code: {response.status_code}")
+
+    data = response.json()
+    if data.get("message") != "OK":
+        error_exit(data.get("message", "Unknown error occurred"))
+
+    return data.get("releases", [])
+
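+# Assumed response shape (illustrative, based only on the fields this script
+# reads): a JSON object with a "message" field and a "releases" list whose
+# entries hold "<prefix>FileUrl"/"<prefix>FileName" pairs for the prefixes
+# "archive", "checksum", "signature" and "publicKey".
+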
+def download_release_file(item_id, release_ordinal, release, file_json_prefix, file_type=None, items_folder="maps"):
+    """Download the specified file type for a given release of an item."""
+    file_type = file_type or file_json_prefix
+    file_url = release.get(f"{file_json_prefix}FileUrl")
+    file_name = release.get(f"{file_json_prefix}FileName")
+
+    if not file_url or not file_name:
+        error_exit(f"Missing {file_type} file information for release {release_ordinal} of item {item_id}.")
+
+    file_destination = os.path.join(items_folder, file_name)
+    print(f"Downloading item {item_id} {file_type} file: {file_name}")
+    response = requests.get(file_url, stream=True)
+
+    if response.status_code == 200:
+        with open(file_destination, "wb") as f:
+            f.write(response.content)
+        return file_destination
+    else:
+        error_exit(f"Failed to download {file_type} file for item {item_id}. Status code: {response.status_code}")
+
+def validate_download_hash(file_destination:str, item_hash:str):
+    """Verify the SHA-256 hash of a downloaded file against the expected value."""
+    with open(file_destination, "rb") as f:
+        checksum = hashlib.sha256(f.read()).hexdigest()
+    if checksum.upper() == item_hash.upper():
+        print(f"Verified hash of {file_destination} {checksum}")
+    else:
+        error_exit(f"Could not validate origin of {file_destination}. The SHA-256 hash should be: {item_hash}, but got {checksum} instead")
+
+def unzip_download(file_destination:str, items_folder="maps"):
+    """Extract a downloaded .zip archive into the items folder."""
+    with zipfile.ZipFile(file_destination, 'r') as zip_ref:
+        zip_ref.extractall(items_folder)
+
+def extract_icd10():
+    #ICD10_edition5
+    df = pd.read_xml("maps/ICD10_Edition5_XML_20160401/Content/ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml")
+    df = df[["CODE", "ALT_CODE", "DESCRIPTION"]]
+    df = df.rename(columns={"CODE":"icd10_code",
+                            "ALT_CODE":"icd10_alt_code",
+                            "DESCRIPTION":"description"
+                            })
+    df.to_parquet("maps/processed/icd10_code.parquet", index=False)
+    print("Extracted ", "maps/processed/icd10_code.parquet")
+
+def extract_opcs4():
+    df = pd.read_csv("maps/OPCS410 Data files txt/OPCS410 CodesAndTitles Nov 2022 V1.0.txt", sep='\t', dtype=str, header=None)
+    df = df.rename(columns={0:"opcs4_code", 1:"description"})
+    df.to_parquet("maps/processed/opcs4_code.parquet", index=False)
+    print("Extracted ", "maps/processed/opcs4_code.parquet")
+
+def extract_nhs_data_migrations():
+    #NHS Data Migrations
+
+    #snomed only
+    df = pd.read_csv('maps/Mapping Tables/Updated/Clinically Assured/sctcremap_uk_20200401000001.txt', sep='\t')
+    df = df[["SCT_CONCEPTID"]]
+    df = df.rename(columns={"SCT_CONCEPTID":"snomed_code"})
+    df = df.drop_duplicates()
+    df = df.astype(str)
+    df.to_parquet("maps/processed/snomed_code.parquet", index=False)
+    print("Extracted ", "maps/processed/snomed_code.parquet")
+
+    #r2 -> r3
+    df = pd.read_csv('maps/Mapping Tables/Updated/Clinically Assured/rctctv3map_uk_20200401000001.txt', sep='\t')
+    df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]]
+    df = df.rename(columns={"V2_CONCEPTID":"read2_code",
+                            "CTV3_CONCEPTID":"read3_code"})
+    df.to_parquet("maps/processed/read2_code_to_read3_code.parquet", index=False)
+    print("Extracted ", "maps/processed/read2_code_to_read3_code.parquet")
+
+    #r3->r2
+    df = pd.read_csv('maps/Mapping Tables/Updated/Clinically Assured/ctv3rctmap_uk_20200401000002.txt', sep='\t')
+    df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]]
+    df = df.rename(columns={"CTV3_CONCEPTID":"read3_code",
+                            "V2_CONCEPTID":"read2_code"})
+    df = df.drop_duplicates()
+    df = df[~df["read2_code"].str.match("^.*_.*$")] #remove r2 codes with '_'
+    df.to_parquet("maps/processed/read3_code_to_read2_code.parquet", index=False)
+    print("Extracted ", "maps/processed/read3_code_to_read2_code.parquet")
+
+    #r2 -> snomed
+    df = pd.read_csv('maps/Mapping Tables/Updated/Clinically Assured/rcsctmap2_uk_20200401000001.txt', sep='\t', dtype=str)
+    df = df[["ReadCode", "ConceptId"]]
+    df = df.rename(columns={"ReadCode":"read2_code",
+                            "ConceptId":"snomed_code"})
+    df.to_parquet("maps/processed/read2_code_to_snomed_code.parquet", index=False)
+    print("Extracted ", "maps/processed/read2_code_to_snomed_code.parquet")
+
+    #r3->snomed
+    df = pd.read_csv('maps/Mapping Tables/Updated/Clinically Assured/ctv3sctmap2_uk_20200401000001.txt', sep='\t')
+    df = df[["CTV3_TERMID", "SCT_CONCEPTID"]]
+    df = df.rename(columns={"CTV3_TERMID":"read3_code",
+                            "SCT_CONCEPTID":"snomed_code"})
+    df["snomed_code"] = df["snomed_code"].astype(str)
+    df = df[~df["snomed_code"].str.match("^.*_.*$")] #remove snomed codes with '_'
+    df.to_parquet("maps/processed/read3_code_to_snomed_code.parquet", index=False)
+    print("Extracted ", "maps/processed/read3_code_to_snomed_code.parquet")
+
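+# Illustrative helper (not called anywhere in this script): translating Read v2
+# codes to SNOMED with the mapping table extracted above. A minimal sketch,
+# assuming the .parquet files have already been generated.
+def translate_read2_to_snomed(read2_codes):
+    mapping = pd.read_parquet("maps/processed/read2_code_to_snomed_code.parquet")
+    return mapping[mapping["read2_code"].isin(read2_codes)]["snomed_code"].unique()
+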
+def extract_nhs_read_browser():
+    #r2 only
+    df = simpledbf.Dbf5('maps/Standard/V2/ANCESTOR.DBF').to_dataframe()
+    df = pd.concat([df['READCODE'], df['DESCENDANT']])
+    df = pd.DataFrame(df.drop_duplicates())
+    df = df.rename(columns={0:"read2_code"})
+    df.to_parquet("maps/processed/read2_code.parquet", index=False)
+    print("Extracted ", "maps/processed/read2_code.parquet")
+
+    #r2 -> atc
+    df = simpledbf.Dbf5('maps/Standard/V2/ATC.DBF').to_dataframe()
+    df = df[["READCODE", "ATC"]]
+    df = df.rename(columns={"READCODE":"read2_code", "ATC":"atc_code"})
+    df.to_parquet("maps/processed/read2_code_to_atc_code.parquet", index=False)
+    print("Extracted ", "maps/processed/read2_code_to_atc_code.parquet")
+
+    #r2 -> icd10
+    df = simpledbf.Dbf5('maps/Standard/V2/ICD10.DBF').to_dataframe()
+    df = df[["READ_CODE", "TARG_CODE"]]
+    df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"icd10_code"})
+    df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
+    df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
+    df.to_parquet("maps/processed/read2_code_to_icd10_code.parquet", index=False)
+    print("Extracted ", "maps/processed/read2_code_to_icd10_code.parquet")
+
+    #r2 -> opcs4
+    df = simpledbf.Dbf5('maps/Standard/V2/OPCS4V3.DBF').to_dataframe()
+    df = df[["READ_CODE", "TARG_CODE"]]
+    df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"opcs4_code"})
+    df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
+    df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
+    df.to_parquet("maps/processed/read2_code_to_opcs4_code.parquet", index=False)
+    print("Extracted ", "maps/processed/read2_code_to_opcs4_code.parquet")
+
+    #r3 only
+    df = simpledbf.Dbf5('maps/Standard/V3/ANCESTOR.DBF').to_dataframe()
+    df = pd.concat([df['READCODE'], df['DESCENDANT']])
+    df = pd.DataFrame(df.drop_duplicates())
+    df = df.rename(columns={0:"read3_code"})
+    df.to_parquet("maps/processed/read3_code.parquet", index=False)
+    print("Extracted ", "maps/processed/read3_code.parquet")
+
+    #r3 -> icd10
+    df = simpledbf.Dbf5('maps/Standard/V3/ICD10.DBF').to_dataframe()
+    df = df[["READ_CODE", "TARG_CODE"]]
+    df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"icd10_code"})
+    df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
+    df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
+    df.to_parquet("maps/processed/read3_code_to_icd10_code.parquet", index=False)
+    print("Extracted ", "maps/processed/read3_code_to_icd10_code.parquet")
+
+    #r3 -> icd9
+    # dbf = simpledbf.Dbf5('maps/Standard/V3/ICD9V3.DBF')
+
+    #r3 -> opcs4
+    df = simpledbf.Dbf5('maps/Standard/V3/OPCS4V3.DBF').to_dataframe()
+    df = df[["READ_CODE", "TARG_CODE"]]
+    df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"opcs4_code"})
+    df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
+    df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
+    df.to_parquet("maps/processed/read3_code_to_opcs4_code.parquet", index=False)
+    print("Extracted ", "maps/processed/read3_code_to_opcs4_code.parquet")
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Download releases of items using the TRUD API.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument("--key", type=str, help="TRUD API Key")
+#    parser.add_argument("item_ids", nargs="+", help="Item IDs to download releases for.")
+#    parser.add_argument("-l", "--latest", action="store_true", help="Download only the latest release")
+#    parser.add_argument("-c", "--checksum", action="store_true", help="Also download the checksum file")
+#    parser.add_argument("-s", "--signature", action="store_true", help="Also download the signature file")
+#    parser.add_argument("-p", "--public_key", action="store_true", help="Also download the public key file")
+
+    args = parser.parse_args()
+    validate_api_key(args.key)
+
+    items_latest = True
+    items_folder = "maps"
+    # Ensure the download and output folders exist
+    Path(items_folder, "processed").mkdir(parents=True, exist_ok=True)
+    items = [
+        {
+            "id": 259,
+            "name": "NHS ICD-10 5th Edition XML data files",
+            "hash": "A4F7BBA6E86349AADD0F4696C5E91152EB99CC06121427FC359160439B9F883F",
+            "extract": extract_icd10,
+        },
+        {
+            "id": 119,
+            "name": "OPCS-4 data files",
+            "hash": "0615A2BF43FFEF94517F1D1E0C05493B627839F323F22C52CBCD8B40BF767CD3",
+            "extract": extract_opcs4,
+        },
+        {
+            "id": 9,
+            "name": "NHS Data Migration",
+            "hash": "D4317B3ADBA6E1247CF17F0B7CD2B8850FD36C0EA2923BF684EA6159F3A54765",
+            "extract": extract_nhs_data_migrations,
+        },
+        {
+            "id": 8,
+            "name": "NHS Read Browser",
+            "hash": "1FFF2CBF11D0E6D7FC6CC6F13DD52D2F459095C3D83A3F754E6C359F16913C5E",
+            "extract": extract_nhs_read_browser,
+        },
+        # TODO: Download BNF from separate site? https://www.nhsbsa.nhs.uk/sites/default/files/2024-10/BNF%20Snomed%20Mapping%20data%2020241016.zip
+    ]
+
+    # Validate and process each item ID
+    for item in items:
+        item_id = item["id"]
+        print(bcolors.HEADER, "---"+item["name"]+"---", bcolors.ENDC)
+
+        releases = get_releases(item_id, API_KEY=args.key, latest=items_latest)
+        if not releases:
+            error_exit(f"No releases found for item {item_id}.")
+
+        # Process each release in reverse order
+        for release_ordinal, release in enumerate(releases[::-1], 1):
+            # Download archive file
+            file_destination = download_release_file(item_id, release_ordinal, release, "archive", items_folder=items_folder)
+
+            # Optional files
+            # if args.checksum:
+            #     download_release_file(item["id"], release_ordinal, release, "checksum")
+            # if args.signature:
+            #     download_release_file(item["id"], release_ordinal, release, "signature")
+            # if args.public_key:
+            #     download_release_file(item["id"], release_ordinal, release, "publicKey", "public key")
+
+            # Verify hash if available
+            if "hash" in item:
+                validate_download_hash(file_destination, item["hash"])
+
+            # Unzip downloaded .zip
+            unzip_download(file_destination, items_folder=items_folder)
+
+            # Extract tables to parquet
+            if "extract" in item:
+                item["extract"]()
+
+        print(f"Downloaded {release_ordinal} release(s) for item {item_id}.")
+
+if __name__ == "__main__":
+    main()
--
GitLab