From 43692665e52622b26cc6a109cad8682fe695cc87 Mon Sep 17 00:00:00 2001 From: Michael Boniface <m.j.boniface@soton.ac.uk> Date: Tue, 18 Feb 2025 10:15:47 +0000 Subject: [PATCH] moved trud api key to an environment variable; stored trud versions in file trud_version.json to keep track of the source mappings when we commit to a repo --- README.md | 26 +++++++++ acmc.py | 3 +- trud.py | 171 ++++++++++++++++++++++++++++-------------------------- 3 files changed, 117 insertions(+), 83 deletions(-) diff --git a/README.md b/README.md index c2dd4da..0029a7a 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,32 @@ The tool supports verification and mapping across diagnostic coding formats belo - [**ICD-10:**](https://icd.who.int/browse10/2019/en) International Classification of Diseases (ICD) is a medical classification list from the World Health Organization (WHO) and widely used in hospital settings, e.g. Hospital Episode Statistics (HES). - [**ATC Codes:**](https://www.who.int/tools/atc-ddd-toolkit/atc-classification) Anatomical Therapeutic Chemical (ATC) Classification is a drug classification list from the World Health Organization (WHO) +## Notes + +Linux/macOS: + +``` +export ACMC_TRUD_API_KEY="your_api_key" +export ACMC_GITLAB_PAT="your_personal_access_token" +export ACMC_GITHUB_PAT="your_personal_access_token" +``` + +Windows (Command prompt): + +``` +set ACMC_TRUD_API_KEY=your_api_key +set ACMC_GITLAB_PAT=your_personal_access_token +set ACMC_GITHUB_PAT=your_personal_access_token +``` + +Windows (Powershell): + +``` +$env:ACMC_TRUD_API_KEY="your_api_key" +$env:ACMC_GITLAB_PAT="your_personal_access_token" +$env:ACMC_GITHUB_PAT="your_personal_access_token" +``` + ## Installation **1. 
Setup Conda Enviroment** diff --git a/acmc.py b/acmc.py index 6ecf035..e79b7f9 100644 --- a/acmc.py +++ b/acmc.py @@ -10,7 +10,7 @@ BUILD_PATH = Path('build') def trud_install(args): """Handle the `trud install` command.""" - trud.install(args.api_key) + trud.install() def omop_install(args): """Handle the `omop install` command.""" @@ -72,7 +72,6 @@ def main(): # trud install trud_install_parser = trud_subparsers.add_parser("install", help="Install TRUD components") - trud_install_parser.add_argument("-k", "--api-key", required=True, help="TRUD API Key") trud_install_parser.set_defaults(func=trud_install) ### OMOP Command ### diff --git a/trud.py b/trud.py index 0b28b51..a491684 100644 --- a/trud.py +++ b/trud.py @@ -15,19 +15,15 @@ import simpledbf # Constants FQDN = "isd.digital.nhs.uk" -TRUD_DIR = Path('./build/trud') -TRUD_DOWNLOADS_DIR = TRUD_DIR / 'downloads' -TRUD_PROCESSED_DIR = TRUD_DIR / 'processed' +TRUD_PATH = Path('./build/trud') +TRUD_VERSION_PATH = TRUD_PATH / 'trud_version.json' +TRUD_DOWNLOADS_DIR = TRUD_PATH / 'downloads' +TRUD_PROCESSED_DIR = TRUD_PATH / 'processed' def error_exit(message): print(message, "error") sys.exit(1) -def validate_api_key(api_key): - """Validate that the API key is 40-character hexadecimal.""" - if not api_key or len(api_key) != 40 or not all(c in "0123456789abcdef" for c in api_key.lower()): - error_exit("Invalid API key format. Expected a 40-character hexadecimal string.") - def get_releases(item_id, API_KEY, latest=False): """Retrieve release information for an item from the TRUD API.""" url = f"https://{FQDN}/trud/api/v1/keys/{API_KEY}/items/{item_id}/releases" @@ -255,11 +251,11 @@ def create_map_directories(): # Check if build directory exists create_map_dirs = False - if TRUD_DIR.exists(): - user_input = input(f"The map directory {TRUD_DIR} already exists. Do you want to download and process trud data again? 
(y/n): ").strip().lower() + if TRUD_PATH.exists(): + user_input = input(f"The map directory {TRUD_PATH} already exists. Do you want to download and process trud data again? (y/n): ").strip().lower() if user_input == "y": # delete all build files - shutil.rmtree(TRUD_DIR) + shutil.rmtree(TRUD_PATH) create_map_dirs = True elif user_input == "n": print("Exiting TRUD installation") @@ -269,76 +265,89 @@ def create_map_directories(): if create_map_dirs: # create maps directories - TRUD_DIR.mkdir(parents=True, exist_ok=True) + TRUD_PATH.mkdir(parents=True, exist_ok=True) TRUD_DOWNLOADS_DIR.mkdir(parents=True, exist_ok=True) TRUD_PROCESSED_DIR.mkdir(parents=True,exist_ok=True) -def install(api_key): - print(f"Installing TRUD") - create_map_directories() - - items_latest = True - items = [ - { - "id": 259, - "name": "NHS ICD-10 5th Edition XML data files", - "hash": "A4F7BBA6E86349AADD0F4696C5E91152EB99CC06121427FC359160439B9F883F", - "extract": extract_icd10, - }, - { - "id": 119, - "name": "OPCS-4 data files", - "hash": "0615A2BF43FFEF94517F1D1E0C05493B627839F323F22C52CBCD8B40BF767CD3", - "extract": extract_opsc4, - }, - { - "id": 9, - "name": "NHS Data Migration", - "hash": "D4317B3ADBA6E1247CF17F0B7CD2B8850FD36C0EA2923BF684EA6159F3A54765", - "extract": extract_nhs_data_migrations, - }, - { - "id": 8, - "name": "NHS Read Browser", - "hash": "1FFF2CBF11D0E6D7FC6CC6F13DD52D2F459095C3D83A3F754E6C359F16913C5E", - "extract": extract_nhs_read_browser, - }, - # TODO: Download BNF from separate site? 
https://www.nhsbsa.nhs.uk/sites/default/files/2024-10/BNF%20Snomed%20Mapping%20data%2020241016.zip - ] - - # Validate and process each item ID - for item in items: - item_id = item["id"] - print(bcolors.HEADER, "---"+item["name"]+"---", bcolors.ENDC) - - releases = get_releases(item_id, API_KEY=api_key, latest=items_latest) - if not releases: - error_exit(f"No releases found for item {item_id}.") - - # Process each release in reverse order - for release_ordinal, release in enumerate(releases[::-1], 1): - # Download archive file - file_destination = download_release_file(item_id, release_ordinal, release, "archive") - - # Optional files - # if items.checksum: - # download_release_file(item["id"], release_ordinal, release, "checksum") - # if items.signature: - # download_release_file(item["id"], release_ordinal, release, "signature") - # if items.public_key: - # download_release_file(item["id"], release_ordinal, release, "publicKey", "public key") - - #Verify Hash if available - if "hash" in item: - validate_download_hash(file_destination, item["hash"]) - - #Unzip downloaded .zip - unzip_download(file_destination) - - #Extract Tables to parquet - if "extract" in item: - item["extract"]() - - print(f"Downloaded {release_ordinal} release(s) for item {item_id}.") - - print(f"TRUD installation completed") \ No newline at end of file +def install(): + print(f"Installing TRUD") + # get TRUD api key from environment variable + api_key = os.getenv("ACMC_TRUD_API_KEY") + if not api_key: + raise ValueError("TRUD API KEY not found. 
Set the ACMC_TRUD_API_KEY environment variable.") + + create_map_directories() + + items_latest = True + items = [ + { + "id": 259, + "name": "NHS ICD-10 5th Edition XML data files", + "hash": "A4F7BBA6E86349AADD0F4696C5E91152EB99CC06121427FC359160439B9F883F", + "extract": extract_icd10, + }, + { + "id": 119, + "name": "OPCS-4 data files", + "hash": "0615A2BF43FFEF94517F1D1E0C05493B627839F323F22C52CBCD8B40BF767CD3", + "extract": extract_opsc4, + }, + { + "id": 9, + "name": "NHS Data Migration", + "hash": "D4317B3ADBA6E1247CF17F0B7CD2B8850FD36C0EA2923BF684EA6159F3A54765", + "extract": extract_nhs_data_migrations, + }, + { + "id": 8, + "name": "NHS Read Browser", + "hash": "1FFF2CBF11D0E6D7FC6CC6F13DD52D2F459095C3D83A3F754E6C359F16913C5E", + "extract": extract_nhs_read_browser, + } + # TODO: Download BNF from separate site? https://www.nhsbsa.nhs.uk/sites/default/files/2024-10/BNF%20Snomed%20Mapping%20data%2020241016.zip + ] + + # save TRUD versions to file to maintain a record of what was downloaded + with open(TRUD_VERSION_PATH, "w", encoding="utf-8") as f: + # remove function from items + data = [{k: v for k, v in d.items() if k != "extract"} for d in items] + json.dump(data, f, indent=4) + + # Validate and process each item ID + for item in items: + item_id = item["id"] + print(bcolors.HEADER, "---"+item["name"]+"---", bcolors.ENDC) + + releases = get_releases(item_id, API_KEY=api_key, latest=items_latest) + if not releases: + error_exit(f"No releases found for item {item_id}.") + + # Process each release in reverse order + for release_ordinal, release in enumerate(releases[::-1], 1): + # Download archive file + file_destination = download_release_file(item_id, release_ordinal, release, "archive") + + # Optional files + # if items.checksum: + # download_release_file(item["id"], release_ordinal, release, "checksum") + # if items.signature: + # download_release_file(item["id"], release_ordinal, release, "signature") + # if items.public_key: + # 
download_release_file(item["id"], release_ordinal, release, "publicKey", "public key") + + #Verify Hash if available + if "hash" in item: + validate_download_hash(file_destination, item["hash"]) + + #Unzip downloaded .zip + unzip_download(file_destination) + + #Extract Tables to parquet + if "extract" in item: + item["extract"]() + + print(f"Downloaded {release_ordinal} release(s) for item {item_id}.") + + + + print(f"TRUD installation completed") \ No newline at end of file -- GitLab