Skip to content
Snippets Groups Projects
Commit c156c2d0 authored by mjbonifa's avatar mjbonifa
Browse files

refactored trud constants from MAPS to TRUD

parent 95c3d5cf
Branches
Tags
No related merge requests found
...@@ -69,7 +69,7 @@ class Read2_code(Proto_code): ...@@ -69,7 +69,7 @@ class Read2_code(Proto_code):
def __init__(self, file_path=None): def __init__(self, file_path=None):
super().__init__(file_path) super().__init__(file_path)
input_path = trud.MAPS_PROCESSED_DIR / 'read2_code.parquet' input_path = trud.TRUD_PROCESSED_DIR / 'read2_code.parquet'
if not input_path.is_file(): if not input_path.is_file():
raise FileNotFoundError(f"Error: Read2 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") raise FileNotFoundError(f"Error: Read2 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
self.db = pd.read_parquet(input_path) self.db = pd.read_parquet(input_path)
...@@ -115,7 +115,7 @@ class Read3_code(Proto_code): ...@@ -115,7 +115,7 @@ class Read3_code(Proto_code):
def __init__(self, file_path=None): def __init__(self, file_path=None):
super().__init__(file_path) super().__init__(file_path)
input_path = trud.MAPS_PROCESSED_DIR / 'read3_code.parquet' input_path = trud.TRUD_PROCESSED_DIR / 'read3_code.parquet'
if not input_path.is_file(): if not input_path.is_file():
raise FileNotFoundError(f"Error: Read3 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") raise FileNotFoundError(f"Error: Read3 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
self.db = pd.read_parquet(input_path) self.db = pd.read_parquet(input_path)
...@@ -160,7 +160,7 @@ class Icd10_code(Proto_code): ...@@ -160,7 +160,7 @@ class Icd10_code(Proto_code):
def __init__(self, file_path=None): def __init__(self, file_path=None):
super().__init__(file_path) super().__init__(file_path)
input_path = trud.MAPS_PROCESSED_DIR / 'icd10_code.parquet' input_path = trud.TRUD_PROCESSED_DIR / 'icd10_code.parquet'
if not input_path.is_file(): if not input_path.is_file():
raise FileNotFoundError(f"Error: ICD10 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") raise FileNotFoundError(f"Error: ICD10 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
self.db = pd.read_parquet(input_path) self.db = pd.read_parquet(input_path)
...@@ -223,7 +223,7 @@ class Snomed_code(Proto_code): ...@@ -223,7 +223,7 @@ class Snomed_code(Proto_code):
def __init__(self, file_path=None): def __init__(self, file_path=None):
super().__init__(file_path) super().__init__(file_path)
input_path = trud.MAPS_PROCESSED_DIR / 'snomed_code.parquet' input_path = trud.TRUD_PROCESSED_DIR / 'snomed_code.parquet'
if not input_path.is_file(): if not input_path.is_file():
raise FileNotFoundError(f"Error: SNOMED code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") raise FileNotFoundError(f"Error: SNOMED code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
self.db = pd.read_parquet(input_path) self.db = pd.read_parquet(input_path)
...@@ -280,7 +280,7 @@ class Opcs4_code(Proto_code): ...@@ -280,7 +280,7 @@ class Opcs4_code(Proto_code):
def __init__(self, file_path=None): def __init__(self, file_path=None):
super().__init__(file_path) super().__init__(file_path)
input_path = trud.MAPS_PROCESSED_DIR / 'opcs4_code.parquet' input_path = trud.TRUD_PROCESSED_DIR / 'opcs4_code.parquet'
if not input_path.is_file(): if not input_path.is_file():
raise FileNotFoundError(f"Error: OPCS4 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") raise FileNotFoundError(f"Error: OPCS4 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
self.db = pd.read_parquet(input_path) self.db = pd.read_parquet(input_path)
......
...@@ -191,7 +191,7 @@ def convert_codes(df, target, translate): ...@@ -191,7 +191,7 @@ def convert_codes(df, target, translate):
print(f"target type {target}") print(f"target type {target}")
for col_name in df.columns[df.columns != target]: for col_name in df.columns[df.columns != target]:
filename = f"{col_name}_to_{target}.parquet" filename = f"{col_name}_to_{target}.parquet"
map_path = trud.MAPS_PROCESSED_DIR / filename map_path = trud.TRUD_PROCESSED_DIR / filename
if map_path.exists(): if map_path.exists():
col = df[col_name] col = df[col_name]
df_map = pd.read_parquet(map_path) df_map = pd.read_parquet(map_path)
......
...@@ -15,9 +15,9 @@ import simpledbf ...@@ -15,9 +15,9 @@ import simpledbf
# Constants # Constants
FQDN = "isd.digital.nhs.uk" FQDN = "isd.digital.nhs.uk"
MAPS_DIR = Path('./build/trud') TRUD_DIR = Path('./build/trud')
MAPS_DOWNLOADS_DIR = MAPS_DIR / 'downloads' TRUD_DOWNLOADS_DIR = TRUD_DIR / 'downloads'
MAPS_PROCESSED_DIR = MAPS_DIR / 'processed' TRUD_PROCESSED_DIR = TRUD_DIR / 'processed'
def error_exit(message): def error_exit(message):
print(message, "error") print(message, "error")
...@@ -44,7 +44,7 @@ def get_releases(item_id, API_KEY, latest=False): ...@@ -44,7 +44,7 @@ def get_releases(item_id, API_KEY, latest=False):
return data.get("releases", []) return data.get("releases", [])
def download_release_file(item_id, release_ordinal, release, file_json_prefix, file_type=None, items_folder=MAPS_DOWNLOADS_DIR): def download_release_file(item_id, release_ordinal, release, file_json_prefix, file_type=None, items_folder=TRUD_DOWNLOADS_DIR):
"""Download specified file type for a given release of an item.""" """Download specified file type for a given release of an item."""
# check folder is a directory # check folder is a directory
...@@ -54,7 +54,7 @@ def download_release_file(item_id, release_ordinal, release, file_json_prefix, f ...@@ -54,7 +54,7 @@ def download_release_file(item_id, release_ordinal, release, file_json_prefix, f
file_type = file_type or file_json_prefix file_type = file_type or file_json_prefix
file_url = release.get(f"{file_json_prefix}FileUrl") file_url = release.get(f"{file_json_prefix}FileUrl")
file_name = release.get(f"{file_json_prefix}FileName") file_name = release.get(f"{file_json_prefix}FileName")
file_destination = MAPS_DOWNLOADS_DIR / file_name file_destination = TRUD_DOWNLOADS_DIR / file_name
if not file_url or not file_name: if not file_url or not file_name:
error_exit(f"Missing {file_type} file information for release {release_ordinal} of item {item_id}.") error_exit(f"Missing {file_type} file information for release {release_ordinal} of item {item_id}.")
...@@ -78,7 +78,7 @@ def validate_download_hash(file_destination:str, item_hash:str): ...@@ -78,7 +78,7 @@ def validate_download_hash(file_destination:str, item_hash:str):
else: else:
error_exit(f"Could not validate origin of {file_destination}. The SHA-256 hash should be: {item_hash}, but got {hash} instead") error_exit(f"Could not validate origin of {file_destination}. The SHA-256 hash should be: {item_hash}, but got {hash} instead")
def unzip_download(file_destination:str, items_folder=MAPS_DOWNLOADS_DIR): def unzip_download(file_destination:str, items_folder=TRUD_DOWNLOADS_DIR):
# check folder is a directory # check folder is a directory
if not items_folder.is_dir(): if not items_folder.is_dir():
...@@ -89,24 +89,24 @@ def unzip_download(file_destination:str, items_folder=MAPS_DOWNLOADS_DIR): ...@@ -89,24 +89,24 @@ def unzip_download(file_destination:str, items_folder=MAPS_DOWNLOADS_DIR):
def extract_icd10(): def extract_icd10():
#ICD10_edition5 #ICD10_edition5
file_path = MAPS_DOWNLOADS_DIR / 'ICD10_Edition5_XML_20160401' / 'Content' / 'ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml' file_path = TRUD_DOWNLOADS_DIR / 'ICD10_Edition5_XML_20160401' / 'Content' / 'ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml'
df = pd.read_xml(file_path) df = pd.read_xml(file_path)
df = df[["CODE", "ALT_CODE", "DESCRIPTION"]] df = df[["CODE", "ALT_CODE", "DESCRIPTION"]]
df = df.rename(columns={"CODE":"icd10_code", df = df.rename(columns={"CODE":"icd10_code",
"ALT_CODE":"icd10_alt_code", "ALT_CODE":"icd10_alt_code",
"DESCRIPTION":"description" "DESCRIPTION":"description"
}) })
output_path = MAPS_PROCESSED_DIR / 'icd10_code.parquet' output_path = TRUD_PROCESSED_DIR / 'icd10_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
def extract_opsc4(): def extract_opsc4():
file_path = MAPS_DOWNLOADS_DIR / 'OPCS410 Data files txt' / 'OPCS410 CodesAndTitles Nov 2022 V1.0.txt' file_path = TRUD_DOWNLOADS_DIR / 'OPCS410 Data files txt' / 'OPCS410 CodesAndTitles Nov 2022 V1.0.txt'
df = pd.read_csv(file_path, sep='\t', dtype=str, header=None) df = pd.read_csv(file_path, sep='\t', dtype=str, header=None)
df = df.rename(columns={0:"opcs4_code", 1:"description"}) df = df.rename(columns={0:"opcs4_code", 1:"description"})
output_path = MAPS_PROCESSED_DIR / 'opcs4_code.parquet' output_path = TRUD_PROCESSED_DIR / 'opcs4_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
...@@ -114,30 +114,30 @@ def extract_nhs_data_migrations(): ...@@ -114,30 +114,30 @@ def extract_nhs_data_migrations():
#NHS Data Migrations #NHS Data Migrations
#snomed only #snomed only
file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'sctcremap_uk_20200401000001.txt' file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'sctcremap_uk_20200401000001.txt'
df = pd.read_csv(file_path, sep='\t') df = pd.read_csv(file_path, sep='\t')
df = df[["SCT_CONCEPTID"]] df = df[["SCT_CONCEPTID"]]
df = df.rename(columns={"SCT_CONCEPTID":"snomed_code"}) df = df.rename(columns={"SCT_CONCEPTID":"snomed_code"})
df = df.drop_duplicates() df = df.drop_duplicates()
df = df.astype(str) df = df.astype(str)
output_path = MAPS_PROCESSED_DIR / 'snomed_code.parquet' output_path = TRUD_PROCESSED_DIR / 'snomed_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r2 -> r3 #r2 -> r3
file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rctctv3map_uk_20200401000001.txt' file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rctctv3map_uk_20200401000001.txt'
df = pd.read_csv(file_path, sep='\t') df = pd.read_csv(file_path, sep='\t')
df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]] df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]]
df = df.rename(columns={"V2_CONCEPTID":"read2_code", df = df.rename(columns={"V2_CONCEPTID":"read2_code",
"CTV3_CONCEPTID":"read3_code"}) "CTV3_CONCEPTID":"read3_code"})
output_path = MAPS_PROCESSED_DIR / 'read2_code_to_read3_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read2_code_to_read3_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r3->r2 #r3->r2
file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3rctmap_uk_20200401000002.txt' file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3rctmap_uk_20200401000002.txt'
df = pd.read_csv(file_path, sep='\t') df = pd.read_csv(file_path, sep='\t')
df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]] df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]]
df = df.rename(columns={"CTV3_CONCEPTID":"read3_code", df = df.rename(columns={"CTV3_CONCEPTID":"read3_code",
...@@ -145,23 +145,23 @@ def extract_nhs_data_migrations(): ...@@ -145,23 +145,23 @@ def extract_nhs_data_migrations():
df = df.drop_duplicates() df = df.drop_duplicates()
df = df[~df["read2_code"].str.match("^.*_.*$")] #remove r2 codes with '_' df = df[~df["read2_code"].str.match("^.*_.*$")] #remove r2 codes with '_'
output_path = MAPS_PROCESSED_DIR / 'read3_code_to_read2_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read3_code_to_read2_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r2 -> snomed #r2 -> snomed
file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rcsctmap2_uk_20200401000001.txt' file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rcsctmap2_uk_20200401000001.txt'
df = pd.read_csv(file_path, sep='\t', dtype=str) df = pd.read_csv(file_path, sep='\t', dtype=str)
df = df[["ReadCode", "ConceptId"]] df = df[["ReadCode", "ConceptId"]]
df = df.rename(columns={"ReadCode":"read2_code", df = df.rename(columns={"ReadCode":"read2_code",
"ConceptId":"snomed_code"}) "ConceptId":"snomed_code"})
output_path = MAPS_PROCESSED_DIR / 'read2_code_to_snomed_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read2_code_to_snomed_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r3->snomed #r3->snomed
file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3sctmap2_uk_20200401000001.txt' file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3sctmap2_uk_20200401000001.txt'
df = pd.read_csv(file_path, sep='\t', dtype=str) df = pd.read_csv(file_path, sep='\t', dtype=str)
df = df[["CTV3_TERMID", "SCT_CONCEPTID"]] df = df[["CTV3_TERMID", "SCT_CONCEPTID"]]
df = df.rename(columns={"CTV3_TERMID":"read3_code", df = df.rename(columns={"CTV3_TERMID":"read3_code",
...@@ -169,70 +169,70 @@ def extract_nhs_data_migrations(): ...@@ -169,70 +169,70 @@ def extract_nhs_data_migrations():
df["snomed_code"] = df["snomed_code"].astype(str) df["snomed_code"] = df["snomed_code"].astype(str)
df = df[~df["snomed_code"].str.match("^.*_.*$")] #remove snomed codes with '_' df = df[~df["snomed_code"].str.match("^.*_.*$")] #remove snomed codes with '_'
output_path = MAPS_PROCESSED_DIR / 'read3_code_to_snomed_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read3_code_to_snomed_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
def extract_nhs_read_browser(): def extract_nhs_read_browser():
#r2 only #r2 only
input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ANCESTOR.DBF' input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ANCESTOR.DBF'
df = simpledbf.Dbf5(input_path).to_dataframe() df = simpledbf.Dbf5(input_path).to_dataframe()
df = pd.concat([df['READCODE'], df['DESCENDANT']]) df = pd.concat([df['READCODE'], df['DESCENDANT']])
df = pd.DataFrame(df.drop_duplicates()) df = pd.DataFrame(df.drop_duplicates())
df = df.rename(columns={0:"read2_code"}) df = df.rename(columns={0:"read2_code"})
output_path = MAPS_PROCESSED_DIR / 'read2_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read2_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r2 -> atc #r2 -> atc
input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ATC.DBF' input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ATC.DBF'
df = simpledbf.Dbf5(input_path).to_dataframe() df = simpledbf.Dbf5(input_path).to_dataframe()
df = df[["READCODE", "ATC"]] df = df[["READCODE", "ATC"]]
df = df.rename(columns={"READCODE":"read2_code", "ATC":"atc_code"}) df = df.rename(columns={"READCODE":"read2_code", "ATC":"atc_code"})
output_path = MAPS_PROCESSED_DIR / 'read2_code_to_atc_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read2_code_to_atc_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r2 -> icd10 #r2 -> icd10
input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ICD10.DBF' input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ICD10.DBF'
df = simpledbf.Dbf5(input_path).to_dataframe() df = simpledbf.Dbf5(input_path).to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]] df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"icd10_code"}) df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"icd10_code"})
df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
output_path = MAPS_PROCESSED_DIR / 'read2_code_to_icd10_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read2_code_to_icd10_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r2 -> opcs4 #r2 -> opcs4
input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'OPCS4V3.DBF' input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'OPCS4V3.DBF'
df = simpledbf.Dbf5(input_path).to_dataframe() df = simpledbf.Dbf5(input_path).to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]] df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"opcs4_code"}) df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"opcs4_code"})
df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
output_path = MAPS_PROCESSED_DIR / 'read2_code_to_opcs4_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read2_code_to_opcs4_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r3 only #r3 only
input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ANCESTOR.DBF' input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ANCESTOR.DBF'
df = simpledbf.Dbf5(input_path).to_dataframe() df = simpledbf.Dbf5(input_path).to_dataframe()
df = pd.concat([df['READCODE'], df['DESCENDANT']]) df = pd.concat([df['READCODE'], df['DESCENDANT']])
df = pd.DataFrame(df.drop_duplicates()) df = pd.DataFrame(df.drop_duplicates())
df = df.rename(columns={0:"read3_code"}) df = df.rename(columns={0:"read3_code"})
output_path = MAPS_PROCESSED_DIR / 'read3_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read3_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r3 -> icd10 #r3 -> icd10
input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ICD10.DBF' input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ICD10.DBF'
df = simpledbf.Dbf5(input_path).to_dataframe() df = simpledbf.Dbf5(input_path).to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]] df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"icd10_code"}) df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"icd10_code"})
df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
output_path = MAPS_PROCESSED_DIR / 'read3_code_to_icd10_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read3_code_to_icd10_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
...@@ -240,13 +240,13 @@ def extract_nhs_read_browser(): ...@@ -240,13 +240,13 @@ def extract_nhs_read_browser():
# dbf = simpledbf.Dbf5('build/maps/downloads/Standard/V3/ICD9V3.DBF') # dbf = simpledbf.Dbf5('build/maps/downloads/Standard/V3/ICD9V3.DBF')
#r3 -> opcs4 #r3 -> opcs4
input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V3' / 'OPCS4V3.DBF' input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'OPCS4V3.DBF'
df = simpledbf.Dbf5(input_path).to_dataframe() df = simpledbf.Dbf5(input_path).to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]] df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"opcs4_code"}) df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"opcs4_code"})
df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
output_path = MAPS_PROCESSED_DIR / 'read3_code_to_opcs4_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read3_code_to_opcs4_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
...@@ -255,11 +255,11 @@ def create_map_directories(): ...@@ -255,11 +255,11 @@ def create_map_directories():
# Check if build directory exists # Check if build directory exists
create_map_dirs = False create_map_dirs = False
if MAPS_DIR.exists(): if TRUD_DIR.exists():
user_input = input(f"The map directory {MAPS_DIR} already exists. Do you want to download and process trud data again? (y/n): ").strip().lower() user_input = input(f"The map directory {TRUD_DIR} already exists. Do you want to download and process trud data again? (y/n): ").strip().lower()
if user_input == "y": if user_input == "y":
# delete all build files # delete all build files
shutil.rmtree(MAPS_DIR) shutil.rmtree(TRUD_DIR)
create_map_dirs = True create_map_dirs = True
elif user_input == "n": elif user_input == "n":
print("Exiting TRUD installation") print("Exiting TRUD installation")
...@@ -269,9 +269,9 @@ def create_map_directories(): ...@@ -269,9 +269,9 @@ def create_map_directories():
if create_map_dirs: if create_map_dirs:
# create maps directories # create maps directories
MAPS_DIR.mkdir(parents=True, exist_ok=True) TRUD_DIR.mkdir(parents=True, exist_ok=True)
MAPS_DOWNLOADS_DIR.mkdir(parents=True, exist_ok=True) TRUD_DOWNLOADS_DIR.mkdir(parents=True, exist_ok=True)
MAPS_PROCESSED_DIR.mkdir(parents=True,exist_ok=True) TRUD_PROCESSED_DIR.mkdir(parents=True,exist_ok=True)
def install(api_key): def install(api_key):
print(f"Installing TRUD") print(f"Installing TRUD")
......
0% Loading, or reload the page.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment