Skip to content
Snippets Groups Projects
Commit c156c2d0 authored by mjbonifa's avatar mjbonifa
Browse files

refactored trud constants from MAPS to TRUD

parent 95c3d5cf
Branches
Tags
No related merge requests found
...@@ -69,7 +69,7 @@ class Read2_code(Proto_code): ...@@ -69,7 +69,7 @@ class Read2_code(Proto_code):
def __init__(self, file_path=None): def __init__(self, file_path=None):
super().__init__(file_path) super().__init__(file_path)
input_path = trud.MAPS_PROCESSED_DIR / 'read2_code.parquet' input_path = trud.TRUD_PROCESSED_DIR / 'read2_code.parquet'
if not input_path.is_file(): if not input_path.is_file():
raise FileNotFoundError(f"Error: Read2 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") raise FileNotFoundError(f"Error: Read2 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
self.db = pd.read_parquet(input_path) self.db = pd.read_parquet(input_path)
...@@ -115,7 +115,7 @@ class Read3_code(Proto_code): ...@@ -115,7 +115,7 @@ class Read3_code(Proto_code):
def __init__(self, file_path=None): def __init__(self, file_path=None):
super().__init__(file_path) super().__init__(file_path)
input_path = trud.MAPS_PROCESSED_DIR / 'read3_code.parquet' input_path = trud.TRUD_PROCESSED_DIR / 'read3_code.parquet'
if not input_path.is_file(): if not input_path.is_file():
raise FileNotFoundError(f"Error: Read3 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") raise FileNotFoundError(f"Error: Read3 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
self.db = pd.read_parquet(input_path) self.db = pd.read_parquet(input_path)
...@@ -160,7 +160,7 @@ class Icd10_code(Proto_code): ...@@ -160,7 +160,7 @@ class Icd10_code(Proto_code):
def __init__(self, file_path=None): def __init__(self, file_path=None):
super().__init__(file_path) super().__init__(file_path)
input_path = trud.MAPS_PROCESSED_DIR / 'icd10_code.parquet' input_path = trud.TRUD_PROCESSED_DIR / 'icd10_code.parquet'
if not input_path.is_file(): if not input_path.is_file():
raise FileNotFoundError(f"Error: ICD10 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") raise FileNotFoundError(f"Error: ICD10 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
self.db = pd.read_parquet(input_path) self.db = pd.read_parquet(input_path)
...@@ -223,7 +223,7 @@ class Snomed_code(Proto_code): ...@@ -223,7 +223,7 @@ class Snomed_code(Proto_code):
def __init__(self, file_path=None): def __init__(self, file_path=None):
super().__init__(file_path) super().__init__(file_path)
input_path = trud.MAPS_PROCESSED_DIR / 'snomed_code.parquet' input_path = trud.TRUD_PROCESSED_DIR / 'snomed_code.parquet'
if not input_path.is_file(): if not input_path.is_file():
raise FileNotFoundError(f"Error: SNOMED code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") raise FileNotFoundError(f"Error: SNOMED code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
self.db = pd.read_parquet(input_path) self.db = pd.read_parquet(input_path)
...@@ -280,7 +280,7 @@ class Opcs4_code(Proto_code): ...@@ -280,7 +280,7 @@ class Opcs4_code(Proto_code):
def __init__(self, file_path=None): def __init__(self, file_path=None):
super().__init__(file_path) super().__init__(file_path)
input_path = trud.MAPS_PROCESSED_DIR / 'opcs4_code.parquet' input_path = trud.TRUD_PROCESSED_DIR / 'opcs4_code.parquet'
if not input_path.is_file(): if not input_path.is_file():
raise FileNotFoundError(f"Error: OPCS4 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") raise FileNotFoundError(f"Error: OPCS4 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
self.db = pd.read_parquet(input_path) self.db = pd.read_parquet(input_path)
......
...@@ -191,7 +191,7 @@ def convert_codes(df, target, translate): ...@@ -191,7 +191,7 @@ def convert_codes(df, target, translate):
print(f"target type {target}") print(f"target type {target}")
for col_name in df.columns[df.columns != target]: for col_name in df.columns[df.columns != target]:
filename = f"{col_name}_to_{target}.parquet" filename = f"{col_name}_to_{target}.parquet"
map_path = trud.MAPS_PROCESSED_DIR / filename map_path = trud.TRUD_PROCESSED_DIR / filename
if map_path.exists(): if map_path.exists():
col = df[col_name] col = df[col_name]
df_map = pd.read_parquet(map_path) df_map = pd.read_parquet(map_path)
......
...@@ -15,9 +15,9 @@ import simpledbf ...@@ -15,9 +15,9 @@ import simpledbf
# Constants # Constants
FQDN = "isd.digital.nhs.uk" FQDN = "isd.digital.nhs.uk"
MAPS_DIR = Path('./build/trud') TRUD_DIR = Path('./build/trud')
MAPS_DOWNLOADS_DIR = MAPS_DIR / 'downloads' TRUD_DOWNLOADS_DIR = TRUD_DIR / 'downloads'
MAPS_PROCESSED_DIR = MAPS_DIR / 'processed' TRUD_PROCESSED_DIR = TRUD_DIR / 'processed'
def error_exit(message): def error_exit(message):
print(message, "error") print(message, "error")
...@@ -44,7 +44,7 @@ def get_releases(item_id, API_KEY, latest=False): ...@@ -44,7 +44,7 @@ def get_releases(item_id, API_KEY, latest=False):
return data.get("releases", []) return data.get("releases", [])
def download_release_file(item_id, release_ordinal, release, file_json_prefix, file_type=None, items_folder=MAPS_DOWNLOADS_DIR): def download_release_file(item_id, release_ordinal, release, file_json_prefix, file_type=None, items_folder=TRUD_DOWNLOADS_DIR):
"""Download specified file type for a given release of an item.""" """Download specified file type for a given release of an item."""
# check folder is a directory # check folder is a directory
...@@ -54,7 +54,7 @@ def download_release_file(item_id, release_ordinal, release, file_json_prefix, f ...@@ -54,7 +54,7 @@ def download_release_file(item_id, release_ordinal, release, file_json_prefix, f
file_type = file_type or file_json_prefix file_type = file_type or file_json_prefix
file_url = release.get(f"{file_json_prefix}FileUrl") file_url = release.get(f"{file_json_prefix}FileUrl")
file_name = release.get(f"{file_json_prefix}FileName") file_name = release.get(f"{file_json_prefix}FileName")
file_destination = MAPS_DOWNLOADS_DIR / file_name file_destination = TRUD_DOWNLOADS_DIR / file_name
if not file_url or not file_name: if not file_url or not file_name:
error_exit(f"Missing {file_type} file information for release {release_ordinal} of item {item_id}.") error_exit(f"Missing {file_type} file information for release {release_ordinal} of item {item_id}.")
...@@ -78,7 +78,7 @@ def validate_download_hash(file_destination:str, item_hash:str): ...@@ -78,7 +78,7 @@ def validate_download_hash(file_destination:str, item_hash:str):
else: else:
error_exit(f"Could not validate origin of {file_destination}. The SHA-256 hash should be: {item_hash}, but got {hash} instead") error_exit(f"Could not validate origin of {file_destination}. The SHA-256 hash should be: {item_hash}, but got {hash} instead")
def unzip_download(file_destination:str, items_folder=MAPS_DOWNLOADS_DIR): def unzip_download(file_destination:str, items_folder=TRUD_DOWNLOADS_DIR):
# check folder is a directory # check folder is a directory
if not items_folder.is_dir(): if not items_folder.is_dir():
...@@ -89,24 +89,24 @@ def unzip_download(file_destination:str, items_folder=MAPS_DOWNLOADS_DIR): ...@@ -89,24 +89,24 @@ def unzip_download(file_destination:str, items_folder=MAPS_DOWNLOADS_DIR):
def extract_icd10(): def extract_icd10():
#ICD10_edition5 #ICD10_edition5
file_path = MAPS_DOWNLOADS_DIR / 'ICD10_Edition5_XML_20160401' / 'Content' / 'ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml' file_path = TRUD_DOWNLOADS_DIR / 'ICD10_Edition5_XML_20160401' / 'Content' / 'ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml'
df = pd.read_xml(file_path) df = pd.read_xml(file_path)
df = df[["CODE", "ALT_CODE", "DESCRIPTION"]] df = df[["CODE", "ALT_CODE", "DESCRIPTION"]]
df = df.rename(columns={"CODE":"icd10_code", df = df.rename(columns={"CODE":"icd10_code",
"ALT_CODE":"icd10_alt_code", "ALT_CODE":"icd10_alt_code",
"DESCRIPTION":"description" "DESCRIPTION":"description"
}) })
output_path = MAPS_PROCESSED_DIR / 'icd10_code.parquet' output_path = TRUD_PROCESSED_DIR / 'icd10_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
def extract_opsc4(): def extract_opsc4():
file_path = MAPS_DOWNLOADS_DIR / 'OPCS410 Data files txt' / 'OPCS410 CodesAndTitles Nov 2022 V1.0.txt' file_path = TRUD_DOWNLOADS_DIR / 'OPCS410 Data files txt' / 'OPCS410 CodesAndTitles Nov 2022 V1.0.txt'
df = pd.read_csv(file_path, sep='\t', dtype=str, header=None) df = pd.read_csv(file_path, sep='\t', dtype=str, header=None)
df = df.rename(columns={0:"opcs4_code", 1:"description"}) df = df.rename(columns={0:"opcs4_code", 1:"description"})
output_path = MAPS_PROCESSED_DIR / 'opcs4_code.parquet' output_path = TRUD_PROCESSED_DIR / 'opcs4_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
...@@ -114,30 +114,30 @@ def extract_nhs_data_migrations(): ...@@ -114,30 +114,30 @@ def extract_nhs_data_migrations():
#NHS Data Migrations #NHS Data Migrations
#snomed only #snomed only
file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'sctcremap_uk_20200401000001.txt' file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'sctcremap_uk_20200401000001.txt'
df = pd.read_csv(file_path, sep='\t') df = pd.read_csv(file_path, sep='\t')
df = df[["SCT_CONCEPTID"]] df = df[["SCT_CONCEPTID"]]
df = df.rename(columns={"SCT_CONCEPTID":"snomed_code"}) df = df.rename(columns={"SCT_CONCEPTID":"snomed_code"})
df = df.drop_duplicates() df = df.drop_duplicates()
df = df.astype(str) df = df.astype(str)
output_path = MAPS_PROCESSED_DIR / 'snomed_code.parquet' output_path = TRUD_PROCESSED_DIR / 'snomed_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r2 -> r3 #r2 -> r3
file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rctctv3map_uk_20200401000001.txt' file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rctctv3map_uk_20200401000001.txt'
df = pd.read_csv(file_path, sep='\t') df = pd.read_csv(file_path, sep='\t')
df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]] df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]]
df = df.rename(columns={"V2_CONCEPTID":"read2_code", df = df.rename(columns={"V2_CONCEPTID":"read2_code",
"CTV3_CONCEPTID":"read3_code"}) "CTV3_CONCEPTID":"read3_code"})
output_path = MAPS_PROCESSED_DIR / 'read2_code_to_read3_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read2_code_to_read3_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r3->r2 #r3->r2
file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3rctmap_uk_20200401000002.txt' file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3rctmap_uk_20200401000002.txt'
df = pd.read_csv(file_path, sep='\t') df = pd.read_csv(file_path, sep='\t')
df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]] df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]]
df = df.rename(columns={"CTV3_CONCEPTID":"read3_code", df = df.rename(columns={"CTV3_CONCEPTID":"read3_code",
...@@ -145,23 +145,23 @@ def extract_nhs_data_migrations(): ...@@ -145,23 +145,23 @@ def extract_nhs_data_migrations():
df = df.drop_duplicates() df = df.drop_duplicates()
df = df[~df["read2_code"].str.match("^.*_.*$")] #remove r2 codes with '_' df = df[~df["read2_code"].str.match("^.*_.*$")] #remove r2 codes with '_'
output_path = MAPS_PROCESSED_DIR / 'read3_code_to_read2_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read3_code_to_read2_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r2 -> snomed #r2 -> snomed
file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rcsctmap2_uk_20200401000001.txt' file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rcsctmap2_uk_20200401000001.txt'
df = pd.read_csv(file_path, sep='\t', dtype=str) df = pd.read_csv(file_path, sep='\t', dtype=str)
df = df[["ReadCode", "ConceptId"]] df = df[["ReadCode", "ConceptId"]]
df = df.rename(columns={"ReadCode":"read2_code", df = df.rename(columns={"ReadCode":"read2_code",
"ConceptId":"snomed_code"}) "ConceptId":"snomed_code"})
output_path = MAPS_PROCESSED_DIR / 'read2_code_to_snomed_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read2_code_to_snomed_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r3->snomed #r3->snomed
file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3sctmap2_uk_20200401000001.txt' file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3sctmap2_uk_20200401000001.txt'
df = pd.read_csv(file_path, sep='\t', dtype=str) df = pd.read_csv(file_path, sep='\t', dtype=str)
df = df[["CTV3_TERMID", "SCT_CONCEPTID"]] df = df[["CTV3_TERMID", "SCT_CONCEPTID"]]
df = df.rename(columns={"CTV3_TERMID":"read3_code", df = df.rename(columns={"CTV3_TERMID":"read3_code",
...@@ -169,70 +169,70 @@ def extract_nhs_data_migrations(): ...@@ -169,70 +169,70 @@ def extract_nhs_data_migrations():
df["snomed_code"] = df["snomed_code"].astype(str) df["snomed_code"] = df["snomed_code"].astype(str)
df = df[~df["snomed_code"].str.match("^.*_.*$")] #remove snomed codes with '_' df = df[~df["snomed_code"].str.match("^.*_.*$")] #remove snomed codes with '_'
output_path = MAPS_PROCESSED_DIR / 'read3_code_to_snomed_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read3_code_to_snomed_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
def extract_nhs_read_browser(): def extract_nhs_read_browser():
#r2 only #r2 only
input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ANCESTOR.DBF' input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ANCESTOR.DBF'
df = simpledbf.Dbf5(input_path).to_dataframe() df = simpledbf.Dbf5(input_path).to_dataframe()
df = pd.concat([df['READCODE'], df['DESCENDANT']]) df = pd.concat([df['READCODE'], df['DESCENDANT']])
df = pd.DataFrame(df.drop_duplicates()) df = pd.DataFrame(df.drop_duplicates())
df = df.rename(columns={0:"read2_code"}) df = df.rename(columns={0:"read2_code"})
output_path = MAPS_PROCESSED_DIR / 'read2_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read2_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r2 -> atc #r2 -> atc
input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ATC.DBF' input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ATC.DBF'
df = simpledbf.Dbf5(input_path).to_dataframe() df = simpledbf.Dbf5(input_path).to_dataframe()
df = df[["READCODE", "ATC"]] df = df[["READCODE", "ATC"]]
df = df.rename(columns={"READCODE":"read2_code", "ATC":"atc_code"}) df = df.rename(columns={"READCODE":"read2_code", "ATC":"atc_code"})
output_path = MAPS_PROCESSED_DIR / 'read2_code_to_atc_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read2_code_to_atc_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r2 -> icd10 #r2 -> icd10
input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ICD10.DBF' input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ICD10.DBF'
df = simpledbf.Dbf5(input_path).to_dataframe() df = simpledbf.Dbf5(input_path).to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]] df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"icd10_code"}) df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"icd10_code"})
df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
output_path = MAPS_PROCESSED_DIR / 'read2_code_to_icd10_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read2_code_to_icd10_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r2 -> opcs4 #r2 -> opcs4
input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'OPCS4V3.DBF' input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'OPCS4V3.DBF'
df = simpledbf.Dbf5(input_path).to_dataframe() df = simpledbf.Dbf5(input_path).to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]] df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"opcs4_code"}) df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"opcs4_code"})
df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
output_path = MAPS_PROCESSED_DIR / 'read2_code_to_opcs4_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read2_code_to_opcs4_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r3 only #r3 only
input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ANCESTOR.DBF' input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ANCESTOR.DBF'
df = simpledbf.Dbf5(input_path).to_dataframe() df = simpledbf.Dbf5(input_path).to_dataframe()
df = pd.concat([df['READCODE'], df['DESCENDANT']]) df = pd.concat([df['READCODE'], df['DESCENDANT']])
df = pd.DataFrame(df.drop_duplicates()) df = pd.DataFrame(df.drop_duplicates())
df = df.rename(columns={0:"read3_code"}) df = df.rename(columns={0:"read3_code"})
output_path = MAPS_PROCESSED_DIR / 'read3_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read3_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
#r3 -> icd10 #r3 -> icd10
input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ICD10.DBF' input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ICD10.DBF'
df = simpledbf.Dbf5(input_path).to_dataframe() df = simpledbf.Dbf5(input_path).to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]] df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"icd10_code"}) df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"icd10_code"})
df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
output_path = MAPS_PROCESSED_DIR / 'read3_code_to_icd10_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read3_code_to_icd10_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
...@@ -240,13 +240,13 @@ def extract_nhs_read_browser(): ...@@ -240,13 +240,13 @@ def extract_nhs_read_browser():
# dbf = simpledbf.Dbf5('build/maps/downloads/Standard/V3/ICD9V3.DBF') # dbf = simpledbf.Dbf5('build/maps/downloads/Standard/V3/ICD9V3.DBF')
#r3 -> opcs4 #r3 -> opcs4
input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V3' / 'OPCS4V3.DBF' input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'OPCS4V3.DBF'
df = simpledbf.Dbf5(input_path).to_dataframe() df = simpledbf.Dbf5(input_path).to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]] df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"opcs4_code"}) df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"opcs4_code"})
df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-' df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
output_path = MAPS_PROCESSED_DIR / 'read3_code_to_opcs4_code.parquet' output_path = TRUD_PROCESSED_DIR / 'read3_code_to_opcs4_code.parquet'
df.to_parquet(output_path, index=False) df.to_parquet(output_path, index=False)
print(f"Extracted: {output_path}") print(f"Extracted: {output_path}")
...@@ -255,11 +255,11 @@ def create_map_directories(): ...@@ -255,11 +255,11 @@ def create_map_directories():
# Check if build directory exists # Check if build directory exists
create_map_dirs = False create_map_dirs = False
if MAPS_DIR.exists(): if TRUD_DIR.exists():
user_input = input(f"The map directory {MAPS_DIR} already exists. Do you want to download and process trud data again? (y/n): ").strip().lower() user_input = input(f"The map directory {TRUD_DIR} already exists. Do you want to download and process trud data again? (y/n): ").strip().lower()
if user_input == "y": if user_input == "y":
# delete all build files # delete all build files
shutil.rmtree(MAPS_DIR) shutil.rmtree(TRUD_DIR)
create_map_dirs = True create_map_dirs = True
elif user_input == "n": elif user_input == "n":
print("Exiting TRUD installation") print("Exiting TRUD installation")
...@@ -269,9 +269,9 @@ def create_map_directories(): ...@@ -269,9 +269,9 @@ def create_map_directories():
if create_map_dirs: if create_map_dirs:
# create maps directories # create maps directories
MAPS_DIR.mkdir(parents=True, exist_ok=True) TRUD_DIR.mkdir(parents=True, exist_ok=True)
MAPS_DOWNLOADS_DIR.mkdir(parents=True, exist_ok=True) TRUD_DOWNLOADS_DIR.mkdir(parents=True, exist_ok=True)
MAPS_PROCESSED_DIR.mkdir(parents=True,exist_ok=True) TRUD_PROCESSED_DIR.mkdir(parents=True,exist_ok=True)
def install(api_key): def install(api_key):
print(f"Installing TRUD") print(f"Installing TRUD")
......
0% Loading, or reload the page.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment