diff --git a/parse.py b/parse.py index 8d66f305db0dbb1fb11d0e50d9f4405649b966b1..4e97642280065e4b3ecbe6e5b4a3d8b51d2ca175 100644 --- a/parse.py +++ b/parse.py @@ -69,7 +69,7 @@ class Read2_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - input_path = trud.MAPS_PROCESSED_DIR / 'read2_code.parquet' + input_path = trud.TRUD_PROCESSED_DIR / 'read2_code.parquet' if not input_path.is_file(): raise FileNotFoundError(f"Error: Read2 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") self.db = pd.read_parquet(input_path) @@ -115,7 +115,7 @@ class Read3_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - input_path = trud.MAPS_PROCESSED_DIR / 'read3_code.parquet' + input_path = trud.TRUD_PROCESSED_DIR / 'read3_code.parquet' if not input_path.is_file(): raise FileNotFoundError(f"Error: Read3 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") self.db = pd.read_parquet(input_path) @@ -160,7 +160,7 @@ class Icd10_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - input_path = trud.MAPS_PROCESSED_DIR / 'icd10_code.parquet' + input_path = trud.TRUD_PROCESSED_DIR / 'icd10_code.parquet' if not input_path.is_file(): raise FileNotFoundError(f"Error: ICD10 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") self.db = pd.read_parquet(input_path) @@ -223,7 +223,7 @@ class Snomed_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - input_path = trud.MAPS_PROCESSED_DIR / 'snomed_code.parquet' + input_path = trud.TRUD_PROCESSED_DIR / 'snomed_code.parquet' if not input_path.is_file(): raise FileNotFoundError(f"Error: SNOMED code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") self.db = pd.read_parquet(input_path) @@ -280,7 +280,7 @@ class Opcs4_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - input_path = trud.MAPS_PROCESSED_DIR / 'opcs4_code.parquet' + input_path = trud.TRUD_PROCESSED_DIR / 'opcs4_code.parquet' if not input_path.is_file(): raise FileNotFoundError(f"Error: OPCS4 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") self.db = pd.read_parquet(input_path) diff --git a/phen.py b/phen.py index 6e8b2901983f55a87386ab90444ffda1c7c828bf..6ba81e64eb7b9d87db66041ce2535b74aaf4f9e6 100644 --- a/phen.py +++ b/phen.py @@ -191,7 +191,7 @@ def convert_codes(df, target, translate): print(f"target type {target}") for col_name in df.columns[df.columns != target]: filename = f"{col_name}_to_{target}.parquet" - map_path = trud.MAPS_PROCESSED_DIR / filename + map_path = trud.TRUD_PROCESSED_DIR / filename if map_path.exists(): col = df[col_name] df_map = pd.read_parquet(map_path) diff --git a/trud.py b/trud.py index 869aa09b3bb6b85d9a6a3f2f3d25accca8e0cbbc..0b28b514b26dc58b566c5f8c151e617b3d12ef98 100644 --- a/trud.py +++ b/trud.py @@ -15,9 +15,9 @@ import simpledbf # Constants FQDN = "isd.digital.nhs.uk" -MAPS_DIR = Path('./build/trud') -MAPS_DOWNLOADS_DIR = MAPS_DIR / 'downloads' -MAPS_PROCESSED_DIR = MAPS_DIR / 'processed' +TRUD_DIR = Path('./build/trud') +TRUD_DOWNLOADS_DIR = TRUD_DIR / 'downloads' +TRUD_PROCESSED_DIR = TRUD_DIR / 'processed' def error_exit(message): print(message, "error") @@ -44,7 +44,7 @@ def get_releases(item_id, API_KEY, latest=False): return data.get("releases", []) -def download_release_file(item_id, release_ordinal, release, file_json_prefix, file_type=None, items_folder=MAPS_DOWNLOADS_DIR): +def download_release_file(item_id, release_ordinal, release, file_json_prefix, file_type=None, items_folder=TRUD_DOWNLOADS_DIR): """Download specified file type for a given release of an item.""" # check folder is a directory @@ -54,7 +54,7 @@ def download_release_file(item_id, release_ordinal, release, file_json_prefix, f file_type = file_type or file_json_prefix file_url = release.get(f"{file_json_prefix}FileUrl") file_name = release.get(f"{file_json_prefix}FileName") - file_destination = MAPS_DOWNLOADS_DIR / file_name + file_destination = TRUD_DOWNLOADS_DIR / file_name if not file_url or not file_name: error_exit(f"Missing {file_type} file information for release {release_ordinal} of item {item_id}.") @@ -78,7 +78,7 @@ def validate_download_hash(file_destination:str, item_hash:str): else: error_exit(f"Could not validate origin of {file_destination}. The SHA-256 hash should be: {item_hash}, but got {hash} instead") -def unzip_download(file_destination:str, items_folder=MAPS_DOWNLOADS_DIR): +def unzip_download(file_destination:str, items_folder=TRUD_DOWNLOADS_DIR): # check folder is a directory if not items_folder.is_dir(): @@ -89,24 +89,24 @@ def unzip_download(file_destination:str, items_folder=MAPS_DOWNLOADS_DIR): def extract_icd10(): #ICD10_edition5 - file_path = MAPS_DOWNLOADS_DIR / 'ICD10_Edition5_XML_20160401' / 'Content' / 'ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml' + file_path = TRUD_DOWNLOADS_DIR / 'ICD10_Edition5_XML_20160401' / 'Content' / 'ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml' df = pd.read_xml(file_path) df = df[["CODE", "ALT_CODE", "DESCRIPTION"]] df = df.rename(columns={"CODE":"icd10_code", "ALT_CODE":"icd10_alt_code", "DESCRIPTION":"description" }) - output_path = MAPS_PROCESSED_DIR / 'icd10_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'icd10_code.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") def extract_opsc4(): - file_path = MAPS_DOWNLOADS_DIR / 'OPCS410 Data files txt' / 'OPCS410 CodesAndTitles Nov 2022 V1.0.txt' + file_path = TRUD_DOWNLOADS_DIR / 'OPCS410 Data files txt' / 'OPCS410 CodesAndTitles Nov 2022 V1.0.txt' df = pd.read_csv(file_path, sep='\t', dtype=str, header=None) df = df.rename(columns={0:"opcs4_code", 1:"description"}) - output_path = MAPS_PROCESSED_DIR / 'opcs4_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'opcs4_code.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -114,30 +114,30 @@ def extract_nhs_data_migrations(): #NHS Data Migrations #snomed only - file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'sctcremap_uk_20200401000001.txt' + file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'sctcremap_uk_20200401000001.txt' df = pd.read_csv(file_path, sep='\t') df = df[["SCT_CONCEPTID"]] df = df.rename(columns={"SCT_CONCEPTID":"snomed_code"}) df = df.drop_duplicates() df = df.astype(str) - output_path = MAPS_PROCESSED_DIR / 'snomed_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'snomed_code.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") #r2 -> r3 - file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rctctv3map_uk_20200401000001.txt' + file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rctctv3map_uk_20200401000001.txt' df = pd.read_csv(file_path, sep='\t') df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]] df = df.rename(columns={"V2_CONCEPTID":"read2_code", "CTV3_CONCEPTID":"read3_code"}) - output_path = MAPS_PROCESSED_DIR / 'read2_code_to_read3_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read2_code_to_read3_code.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") #r3->r2 - file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3rctmap_uk_20200401000002.txt' + file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3rctmap_uk_20200401000002.txt' df = pd.read_csv(file_path, sep='\t') df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]] df = df.rename(columns={"CTV3_CONCEPTID":"read3_code", @@ -145,23 +145,23 @@ def extract_nhs_data_migrations(): df = df.drop_duplicates() df = df[~df["read2_code"].str.match("^.*_.*$")] #remove r2 codes with '_' - output_path = MAPS_PROCESSED_DIR / 'read3_code_to_read2_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read3_code_to_read2_code.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") #r2 -> snomed - file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rcsctmap2_uk_20200401000001.txt' + file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rcsctmap2_uk_20200401000001.txt' df = pd.read_csv(file_path, sep='\t', dtype=str) df = df[["ReadCode", "ConceptId"]] df = df.rename(columns={"ReadCode":"read2_code", "ConceptId":"snomed_code"}) - output_path = MAPS_PROCESSED_DIR / 'read2_code_to_snomed_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read2_code_to_snomed_code.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") #r3->snomed - file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3sctmap2_uk_20200401000001.txt' + file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3sctmap2_uk_20200401000001.txt' df = pd.read_csv(file_path, sep='\t', dtype=str) df = df[["CTV3_TERMID", "SCT_CONCEPTID"]] df = df.rename(columns={"CTV3_TERMID":"read3_code", @@ -169,70 +169,70 @@ def extract_nhs_data_migrations(): df["snomed_code"] = df["snomed_code"].astype(str) df = df[~df["snomed_code"].str.match("^.*_.*$")] #remove snomed codes with '_' - output_path = MAPS_PROCESSED_DIR / 'read3_code_to_snomed_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read3_code_to_snomed_code.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") def extract_nhs_read_browser(): #r2 only - input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ANCESTOR.DBF' + input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ANCESTOR.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = pd.concat([df['READCODE'], df['DESCENDANT']]) df = pd.DataFrame(df.drop_duplicates()) df = df.rename(columns={0:"read2_code"}) - output_path = MAPS_PROCESSED_DIR / 'read2_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read2_code.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") #r2 -> atc - input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ATC.DBF' + input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ATC.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = df[["READCODE", "ATC"]] df = df.rename(columns={"READCODE":"read2_code", "ATC":"atc_code"}) - output_path = MAPS_PROCESSED_DIR / 'read2_code_to_atc_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read2_code_to_atc_code.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") #r2 -> icd10 - input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ICD10.DBF' + input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ICD10.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = df[["READ_CODE", "TARG_CODE"]] df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"icd10_code"}) df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-' - output_path = MAPS_PROCESSED_DIR / 'read2_code_to_icd10_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read2_code_to_icd10_code.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") #r2 -> opcs4 - input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'OPCS4V3.DBF' + input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'OPCS4V3.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = df[["READ_CODE", "TARG_CODE"]] df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"opcs4_code"}) df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-' - output_path = MAPS_PROCESSED_DIR / 'read2_code_to_opcs4_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read2_code_to_opcs4_code.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") #r3 only - input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ANCESTOR.DBF' + input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ANCESTOR.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = pd.concat([df['READCODE'], df['DESCENDANT']]) df = pd.DataFrame(df.drop_duplicates()) df = df.rename(columns={0:"read3_code"}) - output_path = MAPS_PROCESSED_DIR / 'read3_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read3_code.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") #r3 -> icd10 - input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ICD10.DBF' + input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ICD10.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = df[["READ_CODE", "TARG_CODE"]] df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"icd10_code"}) df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-' - output_path = MAPS_PROCESSED_DIR / 'read3_code_to_icd10_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read3_code_to_icd10_code.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -240,13 +240,13 @@ def extract_nhs_read_browser(): # dbf = simpledbf.Dbf5('build/maps/downloads/Standard/V3/ICD9V3.DBF') #r3 -> opcs4 - input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V3' / 'OPCS4V3.DBF' + input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'OPCS4V3.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = df[["READ_CODE", "TARG_CODE"]] df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"opcs4_code"}) df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-' - output_path = MAPS_PROCESSED_DIR / 'read3_code_to_opcs4_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read3_code_to_opcs4_code.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -255,11 +255,11 @@ def create_map_directories(): # Check if build directory exists create_map_dirs = False - if MAPS_DIR.exists(): - user_input = input(f"The map directory {MAPS_DIR} already exists. Do you want to download and process trud data again? (y/n): ").strip().lower() + if TRUD_DIR.exists(): + user_input = input(f"The map directory {TRUD_DIR} already exists. Do you want to download and process trud data again? (y/n): ").strip().lower() if user_input == "y": # delete all build files - shutil.rmtree(MAPS_DIR) + shutil.rmtree(TRUD_DIR) create_map_dirs = True elif user_input == "n": print("Exiting TRUD installation") @@ -269,9 +269,9 @@ def create_map_directories(): if create_map_dirs: # create maps directories - MAPS_DIR.mkdir(parents=True, exist_ok=True) - MAPS_DOWNLOADS_DIR.mkdir(parents=True, exist_ok=True) - MAPS_PROCESSED_DIR.mkdir(parents=True,exist_ok=True) + TRUD_DIR.mkdir(parents=True, exist_ok=True) + TRUD_DOWNLOADS_DIR.mkdir(parents=True, exist_ok=True) + TRUD_PROCESSED_DIR.mkdir(parents=True,exist_ok=True) def install(api_key): print(f"Installing TRUD")