From c87d14118c99056954e7d91311da373277f8a0ae Mon Sep 17 00:00:00 2001 From: Michael Boniface <m.j.boniface@soton.ac.uk> Date: Tue, 18 Feb 2025 14:51:00 +0000 Subject: [PATCH] shortened all the _code references, removing _code as it is not needed, which makes mapping the API and configuration easier --- acmc.py | 2 +- examples/config.json | 2 +- examples/config2.json | 6 +-- parse.py | 110 +++++++++++++++++++++--------------------- phen.py | 2 +- trud.py | 88 ++++++++++++++++----------------- 6 files changed, 105 insertions(+), 105 deletions(-) diff --git a/acmc.py b/acmc.py index e79b7f9..8d36f9d 100644 --- a/acmc.py +++ b/acmc.py @@ -109,7 +109,7 @@ def main(): # phen map phen_map_parser = phen_subparsers.add_parser("map", help="Process phen mapping") phen_map_parser.add_argument("-d", "--phen-dir", type=str, default=str(phen.DEFAULT_PHEN_PATH.resolve()), help="Phenotype directory") - phen_map_parser.add_argument("-t", "--target-coding", required=True, choices=['read2_code', 'read3_code', 'icd10_code', 'snomed_code', 'opcs4_code'], help="Specify the target coding (read2_code, read3_code, icd10_code, snomed_code, opcs4_code)") + phen_map_parser.add_argument("-t", "--target-coding", required=True, choices=['read2', 'read3', 'icd10', 'snomed', 'opcs4'], help="Specify the target coding (read2, read3, icd10, snomed, opcs4)") # phen map flags phen_map_parser.add_argument("-tr", "--translate", action="store_true", default=False, help="Translate code types") phen_map_parser.add_argument("-v", "--verify", action="store_true", default=False, help="Verify codes") diff --git a/examples/config.json b/examples/config.json index 24e570a..00b8ba2 100644 --- a/examples/config.json +++ b/examples/config.json @@ -23,7 +23,7 @@ { "file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv", "columns": { - "read2_code": "code", + "read2": "code", "metadata": [ "description" ] diff --git a/examples/config2.json b/examples/config2.json index 8bac894..42578cf 100644 --- a/examples/config2.json +++ 
b/examples/config2.json @@ -27,7 +27,7 @@ { "file": "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv", "columns": { - "icd10_code": "code", + "icd10": "code", "metadata": [] }, "concept_set": [ @@ -37,7 +37,7 @@ { "file": "Non-attendance codes/res201-did-not-attend-appointment.csv", "columns": { - "read2_code": "code", + "read2": "code", "metadata": [] }, "concept_set": [ @@ -47,4 +47,4 @@ ] } ] -} \ No newline at end of file +} diff --git a/parse.py b/parse.py index 4e97642..85a0c94 100644 --- a/parse.py +++ b/parse.py @@ -10,7 +10,7 @@ from base import raise_ def in_database(codes, db, col): return codes.isin(db[col]) -class Proto_code(): +class Proto(): """ Define checks as list of 3 tuple: (Message, Condition, Process) - Message = The name of the condition (what is printed and logged) @@ -65,11 +65,11 @@ class Proto_code(): print("Verify: ", bcolors.FAIL, (len(conds) - conds.sum()), " FAILED", bcolors.ENDC) return False -class Read2_code(Proto_code): +class Read2(Proto): def __init__(self, file_path=None): super().__init__(file_path) - input_path = trud.TRUD_PROCESSED_DIR / 'read2_code.parquet' + input_path = trud.TRUD_PROCESSED_DIR / 'read2.parquet' if not input_path.is_file(): raise FileNotFoundError(f"Error: Read2 code file '{input_path}' does not exist. 
Please ensure you have installed TRUD correctly") self.db = pd.read_parquet(input_path) @@ -95,27 +95,27 @@ class Read2_code(Proto_code): lambda codes : codes.str.match("^[a-zA-Z0-9\.]+$"), lambda codes : log_invalid_code(codes, codes.str.match("^[a-zA-Z0-9\.]+$"), #Log non-matching rows - code_type="read2_code", + code_type="read2", file_path=self.file_path, cause="QA Alphanumeric Dot"), ), ( "In Database", - lambda codes : in_database(codes, self.db, "read2_code"), + lambda codes : in_database(codes, self.db, "read2"), lambda codes : log_invalid_code(codes, - in_database(codes, self.db, "read2_code"), #Log non-matching rows - code_type="read2_code", + in_database(codes, self.db, "read2"), #Log non-matching rows + code_type="read2", file_path=self.file_path, cause="QA In Database"), ), ] -class Read3_code(Proto_code): +class Read3(Proto): def __init__(self, file_path=None): super().__init__(file_path) - input_path = trud.TRUD_PROCESSED_DIR / 'read3_code.parquet' + input_path = trud.TRUD_PROCESSED_DIR / 'read3.parquet' if not input_path.is_file(): raise FileNotFoundError(f"Error: Read3 code file '{input_path}' does not exist. 
Please ensure you have installed TRUD correctly") self.db = pd.read_parquet(input_path) @@ -141,26 +141,26 @@ class Read3_code(Proto_code): lambda codes : codes.str.match("^[a-zA-Z0-9\.]+$"), lambda codes : log_invalid_code(codes, codes.str.match("^[a-zA-Z0-9\.]+$"), #Log non-matching rows - code_type="read3_code", + code_type="read3", file_path=self.file_path, cause="QA Alphanumeric Dot"), ), ( "In Database", - lambda codes : in_database(codes, self.db, "read3_code"), + lambda codes : in_database(codes, self.db, "read3"), lambda codes : log_invalid_code(codes, - in_database(codes, self.db, "read3_code"), #Log non-matching rows - code_type="read3_code", + in_database(codes, self.db, "read3"), #Log non-matching rows + code_type="read3", file_path=self.file_path, cause="QA In Database"), ), ] -class Icd10_code(Proto_code): +class Icd10(Proto): def __init__(self, file_path=None): super().__init__(file_path) - input_path = trud.TRUD_PROCESSED_DIR / 'icd10_code.parquet' + input_path = trud.TRUD_PROCESSED_DIR / 'icd10.parquet' if not input_path.is_file(): raise FileNotFoundError(f"Error: ICD10 code file '{input_path}' does not exist. 
Please ensure you have installed TRUD correctly") self.db = pd.read_parquet(input_path) @@ -175,7 +175,7 @@ class Icd10_code(Proto_code): lambda codes : ~(codes.str.len() < 3), lambda codes : log_invalid_code(codes, ~(codes.str.len() < 3), #Log non-matching rows - code_type="icd10_code", + code_type="icd10", file_path=self.file_path, cause="QA Too Short"), ), @@ -190,16 +190,16 @@ class Icd10_code(Proto_code): lambda codes : codes.str.match("^[A-Z0-9]+$"), lambda codes : log_invalid_code(codes, codes.str.match("^[A-Z0-9]+$"), #Log non-matching rows - code_type="icd10_code", + code_type="icd10", file_path=self.file_path, cause="QA Alphanumeric Capital"), ), ( "In Database", - lambda codes : ~(~in_database(codes, self.db, "icd10_code") & ~in_database(codes, self.db, "icd10_alt_code")), + lambda codes : ~(~in_database(codes, self.db, "icd10") & ~in_database(codes, self.db, "icd10_alt")), lambda codes : log_invalid_code(codes, #Log non-matching rows - ~(~in_database(codes, self.db, "icd10_code") & ~in_database(codes, self.db, "icd10_alt_code")), - code_type="icd10_code", + ~(~in_database(codes, self.db, "icd10") & ~in_database(codes, self.db, "icd10_alt")), + code_type="icd10", file_path=self.file_path, cause="QA In Database"), ) @@ -208,7 +208,7 @@ class Icd10_code(Proto_code): # lambda codes : codes.str.match("[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Alpha, Num, Num , Dot?, 4xAlphNum* # lambda codes : log_invalid_code(codes, # codes.str.match("[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Log non-matching rows -# code_type="icd10_code", +# code_type="icd10", # file_path=self.file_path), # ) @@ -219,11 +219,11 @@ class Icd10_code(Proto_code): return codes -class Snomed_code(Proto_code): +class Snomed(Proto): def __init__(self, file_path=None): super().__init__(file_path) - input_path = trud.TRUD_PROCESSED_DIR / 'snomed_code.parquet' + input_path = trud.TRUD_PROCESSED_DIR / 'snomed.parquet' if not input_path.is_file(): raise FileNotFoundError(f"Error: SNOMED code file 
'{input_path}' does not exist. Please ensure you have installed TRUD correctly") self.db = pd.read_parquet(input_path) @@ -236,9 +236,9 @@ class Snomed_code(Proto_code): ( "Too Short", lambda codes : ~(codes.str.len() < 6), - lambda codes : log_invalid_code(codes, #Log non-matching rows + lambda codes : log_invalid_code(codes, #Log non-matching rows ~(codes.str.len() < 6), - code_type="snomed_code", + code_type="snomed", file_path=self.file_path, cause="QA Too Short"), ), @@ -247,7 +247,7 @@ class Snomed_code(Proto_code): lambda codes : ~(codes.str.len() > 18), lambda codes : log_invalid_code(codes, #Log non-matching rows ~(codes.str.len() > 18), - code_type="snomed_code", + code_type="snomed", file_path=self.file_path, cause="QA Too Long"), ), @@ -256,7 +256,7 @@ class Snomed_code(Proto_code): lambda codes : codes.str.match("[0-9]+$"), lambda codes : log_invalid_code(codes, #Log non-matching rows codes.str.match("[0-9]+$"), - code_type="snomed_code", + code_type="snomed", file_path=self.file_path, cause="QA Numeric"), ), @@ -267,20 +267,20 @@ class Snomed_code(Proto_code): # ), ( "In Database", - lambda codes : in_database(codes, self.db, "snomed_code"), + lambda codes : in_database(codes, self.db, "snomed"), lambda codes : log_invalid_code(codes, #Log non-matching rows - in_database(codes, self.db, "snomed_code"), - code_type="snomed_code", + in_database(codes, self.db, "snomed"), + code_type="snomed", file_path=self.file_path, cause="QA In Database"), ) ] -class Opcs4_code(Proto_code): +class Opcs4(Proto): def __init__(self, file_path=None): super().__init__(file_path) - input_path = trud.TRUD_PROCESSED_DIR / 'opcs4_code.parquet' + input_path = trud.TRUD_PROCESSED_DIR / 'opcs4.parquet' if not input_path.is_file(): raise FileNotFoundError(f"Error: OPCS4 code file '{input_path}' does not exist. 
Please ensure you have installed TRUD correctly") self.db = pd.read_parquet(input_path) @@ -292,16 +292,16 @@ class Opcs4_code(Proto_code): ), ( "In Database", - lambda codes : in_database(codes, self.db, "opcs4_code"), + lambda codes : in_database(codes, self.db, "opcs4"), lambda codes : log_invalid_code(codes, #Log non-matching rows - in_database(codes, self.db, "opcs4_code"), - code_type="opcs4_code", + in_database(codes, self.db, "opcs4"), + code_type="opcs4", file_path=self.file_path, cause="QA In Database"), ) ] -class Atc_code(Proto_code): +class Atc(Proto): def __init__(self, file_path=None): super().__init__(file_path) self.checks = [ @@ -315,13 +315,13 @@ class Atc_code(Proto_code): lambda codes : codes.str.match("^[A-Z0-9]+$"), lambda codes : log_invalid_code(codes, #Log non-matching rows codes.str.match("^[A-Z0-9]+$"), - code_type="atc_code", + code_type="atc", file_path=self.file_path, cause="QA Alphanumeric Capital"), ), ] -class Med_code(Proto_code): +class Med(Proto): def __init__(self, file_path=None): super().__init__(file_path) self.checks = [ @@ -332,7 +332,7 @@ class Med_code(Proto_code): ) ] -class Cprd_code(Proto_code): +class Cprd(Proto): def __init__(self, file_path=None): super().__init__(file_path) self.checks = [ @@ -344,23 +344,23 @@ class Cprd_code(Proto_code): ] code_types = { - "read2_code": Read2_code, - "read3_code": Read3_code, - "icd10_code": Icd10_code, - "snomed_code": Snomed_code, - "opcs4_code": Opcs4_code, - "atc_code": Atc_code, - "med_code": Med_code, - "cprd_code": Cprd_code, + "read2": Read2, + "read3": Read3, + "icd10": Icd10, + "snomed": Snomed, + "opcs4": Opcs4, + "atc": Atc, + "med": Med, + "cprd": Cprd, } vocab_types = { - "read2_code": "Read", - "read3_code": None, - "icd10_code": "ICD10CM", - "snomed_code": "SNOMED", - "opcs4_code": "OPCS4", - "atc_code": "ATC", - "med_code": None, - "cprd_code": None, + "read2": "Read", + "read3": None, + "icd10": "ICD10CM", + "snomed": "SNOMED", + "opcs4": "OPCS4", + "atc": 
"ATC", + "med": None, + "cprd": None, } \ No newline at end of file diff --git a/phen.py b/phen.py index f6c99fe..5b37619 100644 --- a/phen.py +++ b/phen.py @@ -15,7 +15,7 @@ from urllib.parse import urlparse, urlunparse # acmc dependencies import trud from base import log_invalid_code, bcolors, raise_ -from parse import Read2_code, Read3_code, Icd10_code, Snomed_code, Opcs4_code, Atc_code, code_types, vocab_types +from parse import Read2, Read3, Icd10, Snomed, Opcs4, Atc, code_types, vocab_types from omop import OMOP_DB_PATH, publish_concept_sets, setup pd.set_option("mode.chained_assignment", None) diff --git a/trud.py b/trud.py index cf758d2..952bd1d 100644 --- a/trud.py +++ b/trud.py @@ -88,11 +88,11 @@ def extract_icd10(): file_path = TRUD_DOWNLOADS_DIR / 'ICD10_Edition5_XML_20160401' / 'Content' / 'ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml' df = pd.read_xml(file_path) df = df[["CODE", "ALT_CODE", "DESCRIPTION"]] - df = df.rename(columns={"CODE":"icd10_code", - "ALT_CODE":"icd10_alt_code", + df = df.rename(columns={"CODE":"icd10", + "ALT_CODE":"icd10_alt", "DESCRIPTION":"description" }) - output_path = TRUD_PROCESSED_DIR / 'icd10_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'icd10.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -100,9 +100,9 @@ def extract_opsc4(): file_path = TRUD_DOWNLOADS_DIR / 'OPCS410 Data files txt' / 'OPCS410 CodesAndTitles Nov 2022 V1.0.txt' df = pd.read_csv(file_path, sep='\t', dtype=str, header=None) - df = df.rename(columns={0:"opcs4_code", 1:"description"}) + df = df.rename(columns={0:"opcs4", 1:"description"}) - output_path = TRUD_PROCESSED_DIR / 'opcs4_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'opcs4.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -113,11 +113,11 @@ def extract_nhs_data_migrations(): file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'sctcremap_uk_20200401000001.txt' df = 
pd.read_csv(file_path, sep='\t') df = df[["SCT_CONCEPTID"]] - df = df.rename(columns={"SCT_CONCEPTID":"snomed_code"}) + df = df.rename(columns={"SCT_CONCEPTID":"snomed"}) df = df.drop_duplicates() df = df.astype(str) - output_path = TRUD_PROCESSED_DIR / 'snomed_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'snomed.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -125,10 +125,10 @@ def extract_nhs_data_migrations(): file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rctctv3map_uk_20200401000001.txt' df = pd.read_csv(file_path, sep='\t') df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]] - df = df.rename(columns={"V2_CONCEPTID":"read2_code", - "CTV3_CONCEPTID":"read3_code"}) + df = df.rename(columns={"V2_CONCEPTID":"read2", + "CTV3_CONCEPTID":"read3"}) - output_path = TRUD_PROCESSED_DIR / 'read2_code_to_read3_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read2_to_read3.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -136,12 +136,12 @@ def extract_nhs_data_migrations(): file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3rctmap_uk_20200401000002.txt' df = pd.read_csv(file_path, sep='\t') df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]] - df = df.rename(columns={"CTV3_CONCEPTID":"read3_code", - "V2_CONCEPTID":"read2_code"}) + df = df.rename(columns={"CTV3_CONCEPTID":"read3", + "V2_CONCEPTID":"read2"}) df = df.drop_duplicates() - df = df[~df["read2_code"].str.match("^.*_.*$")] #remove r2 codes with '_' + df = df[~df["read2"].str.match("^.*_.*$")] #remove r2 codes with '_' - output_path = TRUD_PROCESSED_DIR / 'read3_code_to_read2_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read3_to_read2.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -149,10 +149,10 @@ def extract_nhs_data_migrations(): file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 
'rcsctmap2_uk_20200401000001.txt' df = pd.read_csv(file_path, sep='\t', dtype=str) df = df[["ReadCode", "ConceptId"]] - df = df.rename(columns={"ReadCode":"read2_code", - "ConceptId":"snomed_code"}) + df = df.rename(columns={"ReadCode":"read2", + "ConceptId":"snomed"}) - output_path = TRUD_PROCESSED_DIR / 'read2_code_to_snomed_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read2_to_snomed.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -160,12 +160,12 @@ def extract_nhs_data_migrations(): file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3sctmap2_uk_20200401000001.txt' df = pd.read_csv(file_path, sep='\t', dtype=str) df = df[["CTV3_TERMID", "SCT_CONCEPTID"]] - df = df.rename(columns={"CTV3_TERMID":"read3_code", - "SCT_CONCEPTID":"snomed_code"}) - df["snomed_code"] = df["snomed_code"].astype(str) - df = df[~df["snomed_code"].str.match("^.*_.*$")] #remove snomed codes with '_' + df = df.rename(columns={"CTV3_TERMID":"read3", + "SCT_CONCEPTID":"snomed"}) + df["snomed"] = df["snomed"].astype(str) + df = df[~df["snomed"].str.match("^.*_.*$")] #remove snomed codes with '_' - output_path = TRUD_PROCESSED_DIR / 'read3_code_to_snomed_code.parquet' + output_path = TRUD_PROCESSED_DIR / 'read3_to_snomed.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -175,8 +175,8 @@ def extract_nhs_read_browser(): df = simpledbf.Dbf5(input_path).to_dataframe() df = pd.concat([df['READCODE'], df['DESCENDANT']]) df = pd.DataFrame(df.drop_duplicates()) - df = df.rename(columns={0:"read2_code"}) - output_path = TRUD_PROCESSED_DIR / 'read2_code.parquet' + df = df.rename(columns={0:"read2"}) + output_path = TRUD_PROCESSED_DIR / 'read2.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -184,8 +184,8 @@ def extract_nhs_read_browser(): input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ATC.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() 
df = df[["READCODE", "ATC"]] - df = df.rename(columns={"READCODE":"read2_code", "ATC":"atc_code"}) - output_path = TRUD_PROCESSED_DIR / 'read2_code_to_atc_code.parquet' + df = df.rename(columns={"READCODE":"read2", "ATC":"atc"}) + output_path = TRUD_PROCESSED_DIR / 'read2_to_atc.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -193,10 +193,10 @@ def extract_nhs_read_browser(): input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ICD10.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = df[["READ_CODE", "TARG_CODE"]] - df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"icd10_code"}) - df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-' - df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-' - output_path = TRUD_PROCESSED_DIR / 'read2_code_to_icd10_code.parquet' + df = df.rename(columns={"READ_CODE":"read2", "TARG_CODE":"icd10"}) + df = df[~df["icd10"].str.match("^.*-.*$")] #remove codes with '-' + df = df[~df["read2"].str.match("^.*-.*$")] #remove codes with '-' + output_path = TRUD_PROCESSED_DIR / 'read2_to_icd10.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -204,10 +204,10 @@ def extract_nhs_read_browser(): input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'OPCS4V3.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = df[["READ_CODE", "TARG_CODE"]] - df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"opcs4_code"}) - df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-' - df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-' - output_path = TRUD_PROCESSED_DIR / 'read2_code_to_opcs4_code.parquet' + df = df.rename(columns={"READ_CODE":"read2", "TARG_CODE":"opcs4"}) + df = df[~df["opcs4"].str.match("^.*-.*$")] #remove codes with '-' + df = df[~df["read2"].str.match("^.*-.*$")] #remove codes with '-' + output_path = TRUD_PROCESSED_DIR / 'read2_to_opcs4.parquet' 
df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -216,8 +216,8 @@ def extract_nhs_read_browser(): df = simpledbf.Dbf5(input_path).to_dataframe() df = pd.concat([df['READCODE'], df['DESCENDANT']]) df = pd.DataFrame(df.drop_duplicates()) - df = df.rename(columns={0:"read3_code"}) - output_path = TRUD_PROCESSED_DIR / 'read3_code.parquet' + df = df.rename(columns={0:"read3"}) + output_path = TRUD_PROCESSED_DIR / 'read3.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -225,10 +225,10 @@ def extract_nhs_read_browser(): input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ICD10.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = df[["READ_CODE", "TARG_CODE"]] - df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"icd10_code"}) - df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-' - df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-' - output_path = TRUD_PROCESSED_DIR / 'read3_code_to_icd10_code.parquet' + df = df.rename(columns={"READ_CODE":"read3", "TARG_CODE":"icd10"}) + df = df[~df["icd10"].str.match("^.*-.*$")] #remove codes with '-' + df = df[~df["read3"].str.match("^.*-.*$")] #remove codes with '-' + output_path = TRUD_PROCESSED_DIR / 'read3_to_icd10.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") @@ -239,10 +239,10 @@ def extract_nhs_read_browser(): input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'OPCS4V3.DBF' df = simpledbf.Dbf5(input_path).to_dataframe() df = df[["READ_CODE", "TARG_CODE"]] - df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"opcs4_code"}) - df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-' - df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-' - output_path = TRUD_PROCESSED_DIR / 'read3_code_to_opcs4_code.parquet' + df = df.rename(columns={"READ_CODE":"read3", "TARG_CODE":"opcs4"}) + df = df[~df["opcs4"].str.match("^.*-.*$")] #remove 
codes with '-' + df = df[~df["read3"].str.match("^.*-.*$")] #remove codes with '-' + output_path = TRUD_PROCESSED_DIR / 'read3_to_opcs4.parquet' df.to_parquet(output_path, index=False) print(f"Extracted: {output_path}") -- GitLab