From c87d14118c99056954e7d91311da373277f8a0ae Mon Sep 17 00:00:00 2001
From: Michael Boniface <m.j.boniface@soton.ac.uk>
Date: Tue, 18 Feb 2025 14:51:00 +0000
Subject: [PATCH] shortened all the _code references, removing _code as it is
 not needed and makes the API and configuration easier

---
 acmc.py               |   2 +-
 examples/config.json  |   2 +-
 examples/config2.json |   6 +--
 parse.py              | 110 +++++++++++++++++++++---------------------
 phen.py               |   2 +-
 trud.py               |  88 ++++++++++++++++-----------------
 6 files changed, 105 insertions(+), 105 deletions(-)

diff --git a/acmc.py b/acmc.py
index e79b7f9..8d36f9d 100644
--- a/acmc.py
+++ b/acmc.py
@@ -109,7 +109,7 @@ def main():
 	# phen map
 	phen_map_parser = phen_subparsers.add_parser("map", help="Process phen mapping")
 	phen_map_parser.add_argument("-d", "--phen-dir", type=str, default=str(phen.DEFAULT_PHEN_PATH.resolve()), help="Phenotype directory")
-	phen_map_parser.add_argument("-t", "--target-coding", required=True, choices=['read2_code', 'read3_code', 'icd10_code', 'snomed_code', 'opcs4_code'], help="Specify the target coding (read2_code, read3_code, icd10_code, snomed_code, opcs4_code)")
+	phen_map_parser.add_argument("-t", "--target-coding", required=True, choices=['read2', 'read3', 'icd10', 'snomed', 'opcs4'], help="Specify the target coding (read2, read3, icd10, snomed, opcs4)")
 	# phen map flags
 	phen_map_parser.add_argument("-tr", "--translate", action="store_true", default=False, help="Translate code types")
 	phen_map_parser.add_argument("-v", "--verify", action="store_true", default=False, help="Verify codes")
diff --git a/examples/config.json b/examples/config.json
index 24e570a..00b8ba2 100644
--- a/examples/config.json
+++ b/examples/config.json
@@ -23,7 +23,7 @@
                 {
                     "file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv",
                     "columns": {
-                        "read2_code": "code",
+                        "read2": "code",
                         "metadata": [
                             "description"
                         ]
diff --git a/examples/config2.json b/examples/config2.json
index 8bac894..42578cf 100644
--- a/examples/config2.json
+++ b/examples/config2.json
@@ -27,7 +27,7 @@
                 {
                     "file": "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv",
                     "columns": {
-                        "icd10_code": "code",
+                        "icd10": "code",
                         "metadata": []
                     },
                     "concept_set": [
@@ -37,7 +37,7 @@
                 {
                     "file": "Non-attendance codes/res201-did-not-attend-appointment.csv",
                     "columns": {
-                        "read2_code": "code",
+                        "read2": "code",
                         "metadata": []
                     },
                     "concept_set": [
@@ -47,4 +47,4 @@
             ]
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/parse.py b/parse.py
index 4e97642..85a0c94 100644
--- a/parse.py
+++ b/parse.py
@@ -10,7 +10,7 @@ from base import raise_
 def in_database(codes, db, col):
     return codes.isin(db[col])
 
-class Proto_code():
+class Proto():
 	"""
 	Define checks as list of 3 tuple: (Message, Condition, Process)
 	- Message = The name of the condition (what is printed and logged)
@@ -65,11 +65,11 @@ class Proto_code():
 			print("Verify: ", bcolors.FAIL, (len(conds) - conds.sum()), " FAILED", bcolors.ENDC)
 			return False
 	
-class Read2_code(Proto_code):
+class Read2(Proto):
     def __init__(self, file_path=None):
         super().__init__(file_path)
 		
-        input_path = trud.TRUD_PROCESSED_DIR / 'read2_code.parquet'
+        input_path = trud.TRUD_PROCESSED_DIR / 'read2.parquet'
         if not input_path.is_file():  
             raise FileNotFoundError(f"Error: Read2 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")   
         self.db = pd.read_parquet(input_path)
@@ -95,27 +95,27 @@ class Read2_code(Proto_code):
 				lambda codes : codes.str.match("^[a-zA-Z0-9\.]+$"),
 				lambda codes : log_invalid_code(codes,
 												codes.str.match("^[a-zA-Z0-9\.]+$"), #Log non-matching rows
-												code_type="read2_code",
+												code_type="read2",
 												file_path=self.file_path,
 												cause="QA Alphanumeric Dot"),
 			),
 			(
 				"In Database",
-				lambda codes : in_database(codes, self.db, "read2_code"),
+				lambda codes : in_database(codes, self.db, "read2"),
 				lambda codes : log_invalid_code(codes,
-												in_database(codes, self.db, "read2_code"), #Log non-matching rows
-												code_type="read2_code",
+												in_database(codes, self.db, "read2"), #Log non-matching rows
+												code_type="read2",
 												file_path=self.file_path,
 												cause="QA In Database"),
 			),
 
 		]
 	
-class Read3_code(Proto_code):
+class Read3(Proto):
 	def __init__(self, file_path=None):
 		super().__init__(file_path)
 
-		input_path = trud.TRUD_PROCESSED_DIR / 'read3_code.parquet'
+		input_path = trud.TRUD_PROCESSED_DIR / 'read3.parquet'
 		if not input_path.is_file():  
 			raise FileNotFoundError(f"Error: Read3 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")               
 		self.db = pd.read_parquet(input_path)
@@ -141,26 +141,26 @@ class Read3_code(Proto_code):
 				lambda codes : codes.str.match("^[a-zA-Z0-9\.]+$"),
 				lambda codes : log_invalid_code(codes,
 												codes.str.match("^[a-zA-Z0-9\.]+$"), #Log non-matching rows
-												code_type="read3_code",
+												code_type="read3",
 												file_path=self.file_path, 
 												cause="QA Alphanumeric Dot"),
 			),
 			(
 				"In Database",
-				lambda codes : in_database(codes, self.db, "read3_code"),
+				lambda codes : in_database(codes, self.db, "read3"),
 				lambda codes : log_invalid_code(codes,
-												in_database(codes, self.db, "read3_code"), #Log non-matching rows
-												code_type="read3_code",
+												in_database(codes, self.db, "read3"), #Log non-matching rows
+												code_type="read3",
 												file_path=self.file_path,
 											    cause="QA In Database"),
 			),
 		]
 	
-class Icd10_code(Proto_code):
+class Icd10(Proto):
 	def __init__(self, file_path=None):
 		super().__init__(file_path)
 
-		input_path = trud.TRUD_PROCESSED_DIR / 'icd10_code.parquet'
+		input_path = trud.TRUD_PROCESSED_DIR / 'icd10.parquet'
 		if not input_path.is_file():  
 			raise FileNotFoundError(f"Error: ICD10 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")               
 		self.db = pd.read_parquet(input_path)
@@ -175,7 +175,7 @@ class Icd10_code(Proto_code):
 				lambda codes : ~(codes.str.len() < 3),
 				lambda codes : log_invalid_code(codes,
 												~(codes.str.len() < 3), #Log non-matching rows
-												code_type="icd10_code",
+												code_type="icd10",
 												file_path=self.file_path, 
 												cause="QA Too Short"),
 			),
@@ -190,16 +190,16 @@ class Icd10_code(Proto_code):
 				lambda codes : codes.str.match("^[A-Z0-9]+$"),
 				lambda codes : log_invalid_code(codes,
 												codes.str.match("^[A-Z0-9]+$"), #Log non-matching rows
-												code_type="icd10_code",
+												code_type="icd10",
 												file_path=self.file_path, 
 											    cause="QA Alphanumeric Capital"),
 			),
 			(
 				"In Database",
-				lambda codes : ~(~in_database(codes, self.db, "icd10_code") & ~in_database(codes, self.db, "icd10_alt_code")),
+				lambda codes : ~(~in_database(codes, self.db, "icd10") & ~in_database(codes, self.db, "icd10_alt")),
 				lambda codes : log_invalid_code(codes, #Log non-matching rows
-												~(~in_database(codes, self.db, "icd10_code") & ~in_database(codes, self.db, "icd10_alt_code")), 
-												code_type="icd10_code",
+												~(~in_database(codes, self.db, "icd10") & ~in_database(codes, self.db, "icd10_alt")), 
+												code_type="icd10",
 												file_path=self.file_path,
 												cause="QA In Database"),
 			)
@@ -208,7 +208,7 @@ class Icd10_code(Proto_code):
 # 				lambda codes : codes.str.match("[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Alpha, Num, Num , Dot?, 4xAlphNum*
 # 				lambda codes : log_invalid_code(codes,
 # 												codes.str.match("[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Log non-matching rows
-# 												code_type="icd10_code",
+# 												code_type="icd10",
 # 												file_path=self.file_path),
 
 # 			)
@@ -219,11 +219,11 @@ class Icd10_code(Proto_code):
 		return codes
 		
 	
-class Snomed_code(Proto_code):
+class Snomed(Proto):
 	def __init__(self, file_path=None):
 		super().__init__(file_path)
 
-		input_path = trud.TRUD_PROCESSED_DIR / 'snomed_code.parquet'
+		input_path = trud.TRUD_PROCESSED_DIR / 'snomed.parquet'
 		if not input_path.is_file():  
 			raise FileNotFoundError(f"Error: SNOMED code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")               
 		self.db = pd.read_parquet(input_path)        
@@ -236,9 +236,9 @@ class Snomed_code(Proto_code):
 			(
 				"Too Short",
 				lambda codes : ~(codes.str.len() < 6),
-				lambda codes : log_invalid_code(codes, #Log non-matching rows
+				lambda codes : log_invalid_code(codes, #Log non-matching rows
 												~(codes.str.len() < 6), 
-												code_type="snomed_code",
+												code_type="snomed",
 												file_path=self.file_path,
 											    cause="QA Too Short"),
 			),
@@ -247,7 +247,7 @@ class Snomed_code(Proto_code):
 				lambda codes : ~(codes.str.len() > 18),
 				lambda codes : log_invalid_code(codes, #Log non-matching rows
 												~(codes.str.len() > 18), 
-												code_type="snomed_code",
+												code_type="snomed",
 												file_path=self.file_path,
 											    cause="QA Too Long"),
 			),
@@ -256,7 +256,7 @@ class Snomed_code(Proto_code):
 				lambda codes : codes.str.match("[0-9]+$"),
 				lambda codes : log_invalid_code(codes, #Log non-matching rows
 												codes.str.match("[0-9]+$"),
-												code_type="snomed_code",
+												code_type="snomed",
 												file_path=self.file_path,
 											    cause="QA Numeric"),
 			),
@@ -267,20 +267,20 @@ class Snomed_code(Proto_code):
 			# ),
 			(
 				"In Database",
-				lambda codes : in_database(codes, self.db, "snomed_code"),
+				lambda codes : in_database(codes, self.db, "snomed"),
 				lambda codes : log_invalid_code(codes, #Log non-matching rows
-												in_database(codes, self.db, "snomed_code"), 
-												code_type="snomed_code",
+												in_database(codes, self.db, "snomed"), 
+												code_type="snomed",
 												file_path=self.file_path,
 											    cause="QA In Database"),
 			)
 		]
 
-class Opcs4_code(Proto_code):
+class Opcs4(Proto):
 	def __init__(self, file_path=None):
 		super().__init__(file_path)
 
-		input_path = trud.TRUD_PROCESSED_DIR / 'opcs4_code.parquet'
+		input_path = trud.TRUD_PROCESSED_DIR / 'opcs4.parquet'
 		if not input_path.is_file():  
 			raise FileNotFoundError(f"Error: OPCS4 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")               
 		self.db = pd.read_parquet(input_path)          
@@ -292,16 +292,16 @@ class Opcs4_code(Proto_code):
 			),
 			(
 				"In Database",
-				lambda codes : in_database(codes, self.db, "opcs4_code"),
+				lambda codes : in_database(codes, self.db, "opcs4"),
 				lambda codes : log_invalid_code(codes, #Log non-matching rows
-												in_database(codes, self.db, "opcs4_code"), 
-												code_type="opcs4_code",
+												in_database(codes, self.db, "opcs4"), 
+												code_type="opcs4",
 												file_path=self.file_path,
 											    cause="QA In Database"),
 			)
 		]
 
-class Atc_code(Proto_code):
+class Atc(Proto):
 	def __init__(self, file_path=None):
 		super().__init__(file_path)
 		self.checks = [
@@ -315,13 +315,13 @@ class Atc_code(Proto_code):
 				lambda codes : codes.str.match("^[A-Z0-9]+$"),
 				lambda codes : log_invalid_code(codes, #Log non-matching rows
 												codes.str.match("^[A-Z0-9]+$"), 
-												code_type="atc_code",
+												code_type="atc",
 												file_path=self.file_path,
 											    cause="QA Alphanumeric Capital"),
 			),
 		]
 		
-class Med_code(Proto_code):
+class Med(Proto):
 	def __init__(self, file_path=None):
 		super().__init__(file_path)
 		self.checks = [
@@ -332,7 +332,7 @@ class Med_code(Proto_code):
 			)
 		]
 		
-class Cprd_code(Proto_code):
+class Cprd(Proto):
 	def __init__(self, file_path=None):
 		super().__init__(file_path)
 		self.checks = [
@@ -344,23 +344,23 @@ class Cprd_code(Proto_code):
 		]
 		
 code_types = {
-	"read2_code": Read2_code,
-	"read3_code": Read3_code,
-	"icd10_code": Icd10_code,
-	"snomed_code": Snomed_code,
-	"opcs4_code": Opcs4_code,
-	"atc_code": Atc_code,
-	"med_code": Med_code,
-	"cprd_code": Cprd_code,
+	"read2": Read2,
+	"read3": Read3,
+	"icd10": Icd10,
+	"snomed": Snomed,
+	"opcs4": Opcs4,
+	"atc": Atc,
+	"med": Med,
+	"cprd": Cprd,
 }
 
 vocab_types = {
-	"read2_code": "Read",
-	"read3_code": None,
-	"icd10_code": "ICD10CM",
-	"snomed_code": "SNOMED",
-	"opcs4_code": "OPCS4",
-	"atc_code": "ATC",
-	"med_code": None,
-	"cprd_code": None,
+	"read2": "Read",
+	"read3": None,
+	"icd10": "ICD10CM",
+	"snomed": "SNOMED",
+	"opcs4": "OPCS4",
+	"atc": "ATC",
+	"med": None,
+	"cprd": None,
 }
\ No newline at end of file
diff --git a/phen.py b/phen.py
index f6c99fe..5b37619 100644
--- a/phen.py
+++ b/phen.py
@@ -15,7 +15,7 @@ from urllib.parse import urlparse, urlunparse
 # acmc dependencies 
 import trud
 from base import log_invalid_code, bcolors, raise_
-from parse import Read2_code, Read3_code, Icd10_code, Snomed_code, Opcs4_code, Atc_code, code_types, vocab_types
+from parse import Read2, Read3, Icd10, Snomed, Opcs4, Atc, code_types, vocab_types
 from omop import OMOP_DB_PATH, publish_concept_sets, setup
 
 pd.set_option("mode.chained_assignment", None)
diff --git a/trud.py b/trud.py
index cf758d2..952bd1d 100644
--- a/trud.py
+++ b/trud.py
@@ -88,11 +88,11 @@ def extract_icd10():
     file_path = TRUD_DOWNLOADS_DIR / 'ICD10_Edition5_XML_20160401' / 'Content' / 'ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml'
     df = pd.read_xml(file_path)
     df = df[["CODE", "ALT_CODE", "DESCRIPTION"]]
-    df = df.rename(columns={"CODE":"icd10_code",
-                            "ALT_CODE":"icd10_alt_code",
+    df = df.rename(columns={"CODE":"icd10",
+                            "ALT_CODE":"icd10_alt",
                             "DESCRIPTION":"description"
                         })
-    output_path = TRUD_PROCESSED_DIR / 'icd10_code.parquet'
+    output_path = TRUD_PROCESSED_DIR / 'icd10.parquet'
     df.to_parquet(output_path, index=False)
     print(f"Extracted: {output_path}")
 
@@ -100,9 +100,9 @@ def extract_opsc4():
     file_path = TRUD_DOWNLOADS_DIR / 'OPCS410 Data files txt' / 'OPCS410 CodesAndTitles Nov 2022 V1.0.txt'
     
     df = pd.read_csv(file_path, sep='\t', dtype=str, header=None)
-    df = df.rename(columns={0:"opcs4_code", 1:"description"})
+    df = df.rename(columns={0:"opcs4", 1:"description"})
     
-    output_path = TRUD_PROCESSED_DIR / 'opcs4_code.parquet'   
+    output_path = TRUD_PROCESSED_DIR / 'opcs4.parquet'   
     df.to_parquet(output_path, index=False)
     print(f"Extracted: {output_path}")
 
@@ -113,11 +113,11 @@ def extract_nhs_data_migrations():
     file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'sctcremap_uk_20200401000001.txt'    
     df = pd.read_csv(file_path, sep='\t')    
     df = df[["SCT_CONCEPTID"]]
-    df = df.rename(columns={"SCT_CONCEPTID":"snomed_code"})
+    df = df.rename(columns={"SCT_CONCEPTID":"snomed"})
     df = df.drop_duplicates()
     df = df.astype(str)
 
-    output_path = TRUD_PROCESSED_DIR / 'snomed_code.parquet'    
+    output_path = TRUD_PROCESSED_DIR / 'snomed.parquet'    
     df.to_parquet(output_path, index=False)
     print(f"Extracted: {output_path}")
     
@@ -125,10 +125,10 @@ def extract_nhs_data_migrations():
     file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rctctv3map_uk_20200401000001.txt'
     df = pd.read_csv(file_path, sep='\t')
     df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]]
-    df = df.rename(columns={"V2_CONCEPTID":"read2_code",
-                            "CTV3_CONCEPTID":"read3_code"})
+    df = df.rename(columns={"V2_CONCEPTID":"read2",
+                            "CTV3_CONCEPTID":"read3"})
 
-    output_path = TRUD_PROCESSED_DIR / 'read2_code_to_read3_code.parquet'   
+    output_path = TRUD_PROCESSED_DIR / 'read2_to_read3.parquet'   
     df.to_parquet(output_path, index=False)
     print(f"Extracted: {output_path}")
 
@@ -136,12 +136,12 @@ def extract_nhs_data_migrations():
     file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3rctmap_uk_20200401000002.txt'
     df = pd.read_csv(file_path, sep='\t')
     df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]]
-    df = df.rename(columns={"CTV3_CONCEPTID":"read3_code", 
-                            "V2_CONCEPTID":"read2_code"})
+    df = df.rename(columns={"CTV3_CONCEPTID":"read3", 
+                            "V2_CONCEPTID":"read2"})
     df = df.drop_duplicates()
-    df = df[~df["read2_code"].str.match("^.*_.*$")] #remove r2 codes with '_'
+    df = df[~df["read2"].str.match("^.*_.*$")] #remove r2 codes with '_'
 
-    output_path = TRUD_PROCESSED_DIR / 'read3_code_to_read2_code.parquet'   
+    output_path = TRUD_PROCESSED_DIR / 'read3_to_read2.parquet'   
     df.to_parquet(output_path, index=False)
     print(f"Extracted: {output_path}")
     
@@ -149,10 +149,10 @@ def extract_nhs_data_migrations():
     file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rcsctmap2_uk_20200401000001.txt'
     df = pd.read_csv(file_path, sep='\t', dtype=str)
     df = df[["ReadCode", "ConceptId"]]
-    df = df.rename(columns={"ReadCode":"read2_code",
-                            "ConceptId":"snomed_code"})
+    df = df.rename(columns={"ReadCode":"read2",
+                            "ConceptId":"snomed"})
 
-    output_path = TRUD_PROCESSED_DIR / 'read2_code_to_snomed_code.parquet'   
+    output_path = TRUD_PROCESSED_DIR / 'read2_to_snomed.parquet'   
     df.to_parquet(output_path, index=False)
     print(f"Extracted: {output_path}")
 
@@ -160,12 +160,12 @@ def extract_nhs_data_migrations():
     file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3sctmap2_uk_20200401000001.txt'
     df = pd.read_csv(file_path, sep='\t', dtype=str)
     df = df[["CTV3_TERMID", "SCT_CONCEPTID"]]
-    df = df.rename(columns={"CTV3_TERMID":"read3_code",
-                            "SCT_CONCEPTID":"snomed_code"})
-    df["snomed_code"] = df["snomed_code"].astype(str)
-    df = df[~df["snomed_code"].str.match("^.*_.*$")] #remove snomed codes with '_'
+    df = df.rename(columns={"CTV3_TERMID":"read3",
+                            "SCT_CONCEPTID":"snomed"})
+    df["snomed"] = df["snomed"].astype(str)
+    df = df[~df["snomed"].str.match("^.*_.*$")] #remove snomed codes with '_'
 
-    output_path = TRUD_PROCESSED_DIR / 'read3_code_to_snomed_code.parquet'   
+    output_path = TRUD_PROCESSED_DIR / 'read3_to_snomed.parquet'   
     df.to_parquet(output_path, index=False)
     print(f"Extracted: {output_path}")    
 
@@ -175,8 +175,8 @@ def extract_nhs_read_browser():
     df = simpledbf.Dbf5(input_path).to_dataframe()
     df = pd.concat([df['READCODE'], df['DESCENDANT']])
     df = pd.DataFrame(df.drop_duplicates())
-    df = df.rename(columns={0:"read2_code"})
-    output_path = TRUD_PROCESSED_DIR / 'read2_code.parquet'   
+    df = df.rename(columns={0:"read2"})
+    output_path = TRUD_PROCESSED_DIR / 'read2.parquet'   
     df.to_parquet(output_path, index=False)
     print(f"Extracted: {output_path}")    
 
@@ -184,8 +184,8 @@ def extract_nhs_read_browser():
     input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ATC.DBF'   
     df = simpledbf.Dbf5(input_path).to_dataframe()
     df = df[["READCODE", "ATC"]]
-    df = df.rename(columns={"READCODE":"read2_code", "ATC":"atc_code"})
-    output_path = TRUD_PROCESSED_DIR / 'read2_code_to_atc_code.parquet'   
+    df = df.rename(columns={"READCODE":"read2", "ATC":"atc"})
+    output_path = TRUD_PROCESSED_DIR / 'read2_to_atc.parquet'   
     df.to_parquet(output_path, index=False)
     print(f"Extracted: {output_path}")        
 
@@ -193,10 +193,10 @@ def extract_nhs_read_browser():
     input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ICD10.DBF'      
     df = simpledbf.Dbf5(input_path).to_dataframe()        
     df = df[["READ_CODE", "TARG_CODE"]]
-    df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"icd10_code"})
-    df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
-    df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
-    output_path = TRUD_PROCESSED_DIR / 'read2_code_to_icd10_code.parquet'   
+    df = df.rename(columns={"READ_CODE":"read2", "TARG_CODE":"icd10"})
+    df = df[~df["icd10"].str.match("^.*-.*$")] #remove codes with '-'
+    df = df[~df["read2"].str.match("^.*-.*$")] #remove codes with '-'
+    output_path = TRUD_PROCESSED_DIR / 'read2_to_icd10.parquet'   
     df.to_parquet(output_path, index=False)
     print(f"Extracted: {output_path}")      
 
@@ -204,10 +204,10 @@ def extract_nhs_read_browser():
     input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'OPCS4V3.DBF'  
     df = simpledbf.Dbf5(input_path).to_dataframe()        
     df = df[["READ_CODE", "TARG_CODE"]]
-    df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"opcs4_code"})
-    df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
-    df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
-    output_path = TRUD_PROCESSED_DIR / 'read2_code_to_opcs4_code.parquet'   
+    df = df.rename(columns={"READ_CODE":"read2", "TARG_CODE":"opcs4"})
+    df = df[~df["opcs4"].str.match("^.*-.*$")] #remove codes with '-'
+    df = df[~df["read2"].str.match("^.*-.*$")] #remove codes with '-'
+    output_path = TRUD_PROCESSED_DIR / 'read2_to_opcs4.parquet'   
     df.to_parquet(output_path, index=False)
     print(f"Extracted: {output_path}")      
 
@@ -216,8 +216,8 @@ def extract_nhs_read_browser():
     df = simpledbf.Dbf5(input_path).to_dataframe()    
     df = pd.concat([df['READCODE'], df['DESCENDANT']])
     df = pd.DataFrame(df.drop_duplicates())
-    df = df.rename(columns={0:"read3_code"})
-    output_path = TRUD_PROCESSED_DIR / 'read3_code.parquet'   
+    df = df.rename(columns={0:"read3"})
+    output_path = TRUD_PROCESSED_DIR / 'read3.parquet'   
     df.to_parquet(output_path, index=False)
     print(f"Extracted: {output_path}")     
 
@@ -225,10 +225,10 @@ def extract_nhs_read_browser():
     input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ICD10.DBF'        
     df = simpledbf.Dbf5(input_path).to_dataframe()
     df = df[["READ_CODE", "TARG_CODE"]]
-    df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"icd10_code"})
-    df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
-    df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
-    output_path = TRUD_PROCESSED_DIR / 'read3_code_to_icd10_code.parquet'   
+    df = df.rename(columns={"READ_CODE":"read3", "TARG_CODE":"icd10"})
+    df = df[~df["icd10"].str.match("^.*-.*$")] #remove codes with '-'
+    df = df[~df["read3"].str.match("^.*-.*$")] #remove codes with '-'
+    output_path = TRUD_PROCESSED_DIR / 'read3_to_icd10.parquet'   
     df.to_parquet(output_path, index=False)
     print(f"Extracted: {output_path}")  
 
@@ -239,10 +239,10 @@ def extract_nhs_read_browser():
     input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'OPCS4V3.DBF'      
     df = simpledbf.Dbf5(input_path).to_dataframe()
     df = df[["READ_CODE", "TARG_CODE"]]
-    df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"opcs4_code"})
-    df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
-    df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
-    output_path = TRUD_PROCESSED_DIR / 'read3_code_to_opcs4_code.parquet'   
+    df = df.rename(columns={"READ_CODE":"read3", "TARG_CODE":"opcs4"})
+    df = df[~df["opcs4"].str.match("^.*-.*$")] #remove codes with '-'
+    df = df[~df["read3"].str.match("^.*-.*$")] #remove codes with '-'
+    output_path = TRUD_PROCESSED_DIR / 'read3_to_opcs4.parquet'   
     df.to_parquet(output_path, index=False)
     print(f"Extracted: {output_path}")      
 
-- 
GitLab