Commit 7389291c authored by Jakub Dylag

Auto Download and Extract TRUD Code Tables

parent 4929dd79
@@ -42,8 +42,6 @@ The output code list is then used by data providers to select MELD-B cohorts.
 | SNOMED | NHS TRUD | |
 | OPCS4 | NHS TRUD | |
 | ATC | None | |
-| MED | None | |
-| CPRD Product | None | |
 
 MELD-B refers to various diagnostic code formats included in target datasets.
 * Read V2
@@ -57,19 +55,19 @@ MELD-B refers to various diagnostic code formats included in target datasets.
 ## ⚙️ Setup
-- Delete corrupted files that cannot be read with `bash import.sh`
 
 ### Code Translation Tables
-1. Due to the licencing of NHS TRUD coding tables, the following resources <mark>must be downloaded separately</mark>:
+1. Due to the licensing of NHS TRUD resources, you <mark>MUST first [Sign Up](https://isd.digital.nhs.uk/trud/user/guest/filters/0/account/form) to NHS TRUD and accept the following licences</mark>:
    - [nhs_readbrowser_25.0.0_20180401000001](https://isd.digital.nhs.uk/trud/users/guest/filters/2/categories/9/items/8/releases)
    - [nhs_datamigration_29.0.0_20200401000001](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/9/items/9/releases)
-   - [ICD10_Edition5_XML_20160401](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/28/items/258/releases?source=summary)
+   - [ICD10_Edition5_XML_20160401](https://isd.digital.nhs.uk/trud/users/authenticated/filters/0/categories/28/items/259/releases)
    - [OPCS-4.10 Data files](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/10/items/119/releases)
-   - [BNF/Snomed Mapping data.xlsx](https://www.nhsbsa.nhs.uk/prescription-data/understanding-our-data/bnf-snomed-mapping)
-2. Next, prepare the convertion Tables by saving them as `.parquet` tables.
-   - See "Mappings" section in process_codes_WP.ipynb to generate table with appropriate name
-   - For reversible convertions create a duplicate table with the name reversed. However be aware this is <b>NOT ADVISED</b> and goes against NHS guidance.
+   <!-- - [BNF/Snomed Mapping data.xlsx](https://www.nhsbsa.nhs.uk/prescription-data/understanding-our-data/bnf-snomed-mapping) -->
+2. Once all licences are accepted, get your [API Key](https://isd.digital.nhs.uk/trud/users/authenticated/filters/0/account/manage) for NHS TRUD.
+3. Finally, run the automated extraction script, passing your API key to grant temporary access to the resources above: `python trud_api.py --key <INSERT KEY>` (replace `<INSERT KEY>` with your key).
+   - The conversion tables will be saved as `.parquet` tables in the folder `maps/processed/`.
+   - NHS TRUD defines one-way mappings and does <b>NOT ADVISE</b> reversing them. If you still wish to create two-way mappings, duplicate the given `.parquet` table and reverse the filename (e.g. `read2_code_to_snomed_code.parquet` to `snomed_code_to_read2_code.parquet`); a sketch of this step is shown below.
 
 ### JSON phenotype mapping
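For reference, the reversal step mentioned in the last bullet above is a short pandas operation. The sketch below is illustrative only: it assumes the two-column layout that `trud_api.py` writes for each mapping table (here `read2_code` and `snomed_code`), and, as the README notes, NHS TRUD advises against using reversed mappings.

```python
import pandas as pd

# Illustrative sketch: derive the reversed snomed -> read2 table from the
# one-way read2 -> snomed mapping written by trud_api.py.
df = pd.read_parquet("maps/processed/read2_code_to_snomed_code.parquet")
df = df[["snomed_code", "read2_code"]]  # swap column order to match the reversed filename
df.to_parquet("maps/processed/snomed_code_to_read2_code.parquet", index=False)
```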
trud_api.py (new file added by this commit):
import os
import sys
import requests
import argparse
import hashlib
import zipfile
import pandas as pd
import simpledbf
from base import bcolors
# Constants
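# Host name of the NHS TRUD service, used to build the API request URLs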
FQDN = "isd.digital.nhs.uk"
def error_exit(message):
    """Print an error message and exit with a non-zero status."""
    print(message, file=sys.stderr)
    sys.exit(1)
def validate_api_key(api_key):
"""Validate that the API key is 40-character hexadecimal."""
if not api_key or len(api_key) != 40 or not all(c in "0123456789abcdef" for c in api_key.lower()):
error_exit("Invalid API key format. Expected a 40-character hexadecimal string.")
def get_releases(item_id, API_KEY, latest=False):
"""Retrieve release information for an item from the TRUD API."""
url = f"https://{FQDN}/trud/api/v1/keys/{API_KEY}/items/{item_id}/releases"
if latest:
url += "?latest"
response = requests.get(url)
if response.status_code != 200:
error_exit(f"Failed to fetch releases for item {item_id}. Status code: {response.status_code}")
data = response.json()
if data.get("message") != "OK":
error_exit(data.get("message", "Unknown error occurred"))
return data.get("releases", [])
def download_release_file(item_id, release_ordinal, release, file_json_prefix, file_type=None, items_folder="maps"):
"""Download specified file type for a given release of an item."""
    file_type = file_type or file_json_prefix
    file_url = release.get(f"{file_json_prefix}FileUrl")
    file_name = release.get(f"{file_json_prefix}FileName")
    if not file_url or not file_name:
        error_exit(f"Missing {file_type} file information for release {release_ordinal} of item {item_id}.")
    file_destination = os.path.join(items_folder, file_name)
print(f"Downloading item {item_id} {file_type} file: {file_name}")
    response = requests.get(file_url, stream=True)
    if response.status_code == 200:
        with open(file_destination, "wb") as f:
            # Write the archive to disk in chunks rather than buffering it all in memory
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return file_destination
else:
error_exit(f"Failed to download {file_type} file for item {item_id}. Status code: {response.status_code}")
def validate_download_hash(file_destination: str, item_hash: str):
    """Verify the SHA-256 hash of a downloaded file against the expected value."""
    with open(file_destination, "rb") as f:
        sha256 = hashlib.sha256(f.read()).hexdigest()
    if sha256.upper() == item_hash.upper():
        print(f"Verified hash of {file_destination} {sha256}")
    else:
        error_exit(f"Could not validate origin of {file_destination}. The SHA-256 hash should be: {item_hash}, but got {sha256} instead")
def unzip_download(file_destination:str, items_folder="maps"):
with zipfile.ZipFile(file_destination, 'r') as zip_ref:
zip_ref.extractall(items_folder)
def extract_icd10():
    # ICD10_Edition5: keep code, alternative code and description from the XML release
    df = pd.read_xml("maps/ICD10_Edition5_XML_20160401/Content/ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml")
df = df[["CODE", "ALT_CODE", "DESCRIPTION"]]
df = df.rename(columns={"CODE":"icd10_code",
"ALT_CODE":"icd10_alt_code",
"DESCRIPTION":"description"
})
df.to_parquet("maps/processed/icd10_code.parquet", index=False)
print("Extracted ", "maps/processed/icd10_code.parquet")
def extract_opcs4():
    """Convert the OPCS-4 codes-and-titles text file to parquet."""
df = pd.read_csv("maps/OPCS410 Data files txt/OPCS410 CodesAndTitles Nov 2022 V1.0.txt", sep='\t', dtype=str, header=None)
df = df.rename(columns={0:"opcs4_code", 1:"description"})
df.to_parquet("maps/processed/opcs4_code.parquet", index=False)
print("Extracted ", "maps/processed/opcs4_code.parquet")
def extract_nhs_data_migrations():
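    """Convert the NHS Data Migration mapping tables (Read v2, CTV3, SNOMED) to parquet."""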
#NHS Data Migrations
#snomed only
df = pd.read_csv('maps/Mapping Tables/Updated/Clinically Assured/sctcremap_uk_20200401000001.txt', sep='\t')
df = df[["SCT_CONCEPTID"]]
df = df.rename(columns={"SCT_CONCEPTID":"snomed_code"})
df = df.drop_duplicates()
df = df.astype(str)
df.to_parquet("maps/processed/snomed_code.parquet", index=False)
print("Extracted ", "maps/processed/snomed_code.parquet")
#r2 -> r3
df = pd.read_csv('maps/Mapping Tables/Updated/Clinically Assured/rctctv3map_uk_20200401000001.txt', sep='\t')
df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]]
df = df.rename(columns={"V2_CONCEPTID":"read2_code",
"CTV3_CONCEPTID":"read3_code"})
df.to_parquet("maps/processed/read2_code_to_read3_code.parquet", index=False)
print("Extracted ", "maps/processed/read2_code_to_read3_code.parquet")
#r3->r2
df = pd.read_csv('maps/Mapping Tables/Updated/Clinically Assured/ctv3rctmap_uk_20200401000002.txt', sep='\t')
df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]]
df = df.rename(columns={"CTV3_CONCEPTID":"read3_code",
"V2_CONCEPTID":"read2_code"})
df = df.drop_duplicates()
df = df[~df["read2_code"].str.match("^.*_.*$")] #remove r2 codes with '_'
df.to_parquet("maps/processed/read3_code_to_read2_code.parquet", index=False)
print("Extracted ", "maps/processed/read3_code_to_read2_code.parquet")
#r2 -> snomed
df = pd.read_csv('maps/Mapping Tables/Updated/Clinically Assured/rcsctmap2_uk_20200401000001.txt', sep='\t', dtype=str)
df = df[["ReadCode", "ConceptId"]]
df = df.rename(columns={"ReadCode":"read2_code",
"ConceptId":"snomed_code"})
df.to_parquet("maps/processed/read2_code_to_snomed_code.parquet", index=False)
print("Extracted ", "maps/processed/read2_code_to_snomed_code.parquet")
#r3->snomed
df = pd.read_csv('maps/Mapping Tables/Updated/Clinically Assured/ctv3sctmap2_uk_20200401000001.txt', sep='\t')
df = df[["CTV3_TERMID", "SCT_CONCEPTID"]]
df = df.rename(columns={"CTV3_TERMID":"read3_code",
"SCT_CONCEPTID":"snomed_code"})
df["snomed_code"] = df["snomed_code"].astype(str)
df = df[~df["snomed_code"].str.match("^.*_.*$")] #remove snomed codes with '_'
df.to_parquet("maps/processed/read3_code_to_snomed_code.parquet", index=False)
print("Extracted ", "maps/processed/read3_code_to_snomed_code.parquet")
def extract_nhs_read_browser():
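    """Convert the NHS Read Browser DBF tables (Read code lists and cross-maps) to parquet."""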
#r2 only
df = simpledbf.Dbf5('maps/Standard/V2/ANCESTOR.DBF').to_dataframe()
df = pd.concat([df['READCODE'], df['DESCENDANT']])
df = pd.DataFrame(df.drop_duplicates())
df = df.rename(columns={0:"read2_code"})
df.to_parquet("maps/processed/read2_code.parquet", index=False)
print("Extracted ", "maps/processed/read2_code.parquet")
#r2 -> atc
df = simpledbf.Dbf5('maps/Standard/V2/ATC.DBF').to_dataframe()
df = df[["READCODE", "ATC"]]
df = df.rename(columns={"READCODE":"read2_code", "ATC":"atc_code"})
df.to_parquet("maps/processed/read2_code_to_atc_code.parquet", index=False)
print("Extracted ", "maps/processed/read2_code_to_atc_code.parquet")
#r2 -> icd10
df = simpledbf.Dbf5('maps/Standard/V2/ICD10.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"icd10_code"})
df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
df.to_parquet("maps/processed/read2_code_to_icd10_code.parquet", index=False)
print("Extracted ", "maps/processed/read2_code_to_icd10_code.parquet")
#r2 -> opcs4
df = simpledbf.Dbf5('maps/Standard/V2/OPCS4V3.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"opcs4_code"})
df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
df.to_parquet("maps/processed/read2_code_to_opcs4_code.parquet", index=False)
print("Extracted ", "maps/processed/read2_code_to_opcs4_code.parquet")
#r3 only
df = simpledbf.Dbf5('maps/Standard/V3/ANCESTOR.DBF').to_dataframe()
df = pd.concat([df['READCODE'], df['DESCENDANT']])
df = pd.DataFrame(df.drop_duplicates())
df = df.rename(columns={0:"read3_code"})
df.to_parquet("maps/processed/read3_code.parquet", index=False)
print("Extracted ", "maps/processed/read3_code.parquet")
#r3 -> icd10
df = simpledbf.Dbf5('maps/Standard/V3/ICD10.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"icd10_code"})
df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
df.to_parquet("maps/processed/read3_code_to_icd10_code.parquet", index=False)
print("Extracted ", "maps/processed/read3_code_to_icd10_code.parquet")
#r3 -> icd9
# dbf = simpledbf.Dbf5('maps/Standard/V3/ICD9V3.DBF')
#r3 -> opcs4
df = simpledbf.Dbf5('maps/Standard/V3/OPCS4V3.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"opcs4_code"})
df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
df.to_parquet("maps/processed/read3_code_to_opcs4_code.parquet", index=False)
print("Extracted ", "maps/processed/read3_code_to_opcs4_code.parquet")
def main():
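    """Download, verify, unzip and extract each TRUD item in turn."""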
parser = argparse.ArgumentParser(
description="Download releases of items using the TRUD API.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("--key", type=str, help="TRUD API Key")
# parser.add_argument("item_ids", nargs="+", help="Item IDs to download releases for.")
# parser.add_argument("-l", "--latest", action="store_true", help="Download only the latest release")
# parser.add_argument("-c", "--checksum", action="store_true", help="Also download the checksum file")
# parser.add_argument("-s", "--signature", action="store_true", help="Also download the signature file")
# parser.add_argument("-p", "--public_key", action="store_true", help="Also download the public key file")
    args = parser.parse_args()
    validate_api_key(args.key)
    items_latest = True
    items_folder = "maps"
    # Ensure the download and processed-output folders exist before writing any files
    os.makedirs(items_folder, exist_ok=True)
    os.makedirs(os.path.join(items_folder, "processed"), exist_ok=True)
items = [
{
"id": 259,
"name": "NHS ICD-10 5th Edition XML data files",
"hash": "A4F7BBA6E86349AADD0F4696C5E91152EB99CC06121427FC359160439B9F883F",
"extract": extract_icd10,
},
{
"id": 119,
"name": "OPCS-4 data files",
"hash": "0615A2BF43FFEF94517F1D1E0C05493B627839F323F22C52CBCD8B40BF767CD3",
"extract": extract_opsc4,
},
{
"id": 9,
"name": "NHS Data Migration",
"hash": "D4317B3ADBA6E1247CF17F0B7CD2B8850FD36C0EA2923BF684EA6159F3A54765",
"extract": extract_nhs_data_migrations,
},
{
"id": 8,
"name": "NHS Read Browser",
"hash": "1FFF2CBF11D0E6D7FC6CC6F13DD52D2F459095C3D83A3F754E6C359F16913C5E",
"extract": extract_nhs_read_browser,
},
# TODO: Download BNF from separate site? https://www.nhsbsa.nhs.uk/sites/default/files/2024-10/BNF%20Snomed%20Mapping%20data%2020241016.zip
]
# Validate and process each item ID
for item in items:
item_id = item["id"]
print(bcolors.HEADER, "---"+item["name"]+"---", bcolors.ENDC)
releases = get_releases(item_id, API_KEY=args.key, latest=items_latest)
if not releases:
error_exit(f"No releases found for item {item_id}.")
# Process releases in reverse of the order returned by the API
for release_ordinal, release in enumerate(releases[::-1], 1):
# Download archive file
file_destination = download_release_file(item_id, release_ordinal, release, "archive", items_folder=items_folder)
# Optional files
# if items.checksum:
# download_release_file(item["id"], release_ordinal, release, "checksum")
# if items.signature:
# download_release_file(item["id"], release_ordinal, release, "signature")
# if items.public_key:
# download_release_file(item["id"], release_ordinal, release, "publicKey", "public key")
# Verify the SHA-256 hash if one is pinned for this item
if "hash" in item:
validate_download_hash(file_destination, item["hash"])
# Unzip the downloaded .zip archive
unzip_download(file_destination, items_folder=items_folder)
# Extract tables to parquet
if "extract" in item:
item["extract"]()
print(f"Downloaded {release_ordinal} release(s) for item {item_id}.")
if __name__ == "__main__":
main()