Skip to content
Snippets Groups Projects
Select Git revision
  • 8443e4d9dba74e207d66b696b0419c17543963f6
  • master default protected
2 results

Window.java

Blame
  • parse.py 10.89 KiB
    import pandas as pd
    import numpy as np
    import os
    
    from base import log_invalid_code
    from base import bcolors
    from base import raise_
    		
    def in_database(codes, db, col):
    	"""
    	Flag which codes appear in one column of a lookup table.

    	codes: pandas Series of codes to look up
    	db: table indexable by column name (e.g. a DataFrame)
    	col: name of the column in db holding the known codes
    	returns: boolean Series, True where the code is present in db[col]
    	"""
    	known_codes = db[col]
    	is_known = codes.isin(known_codes)
    	return is_known
    
    class Proto_code():
    	"""
    	Base class holding a list of quality checks for a series of codes.

    	Define checks as list of 3 tuple: (Message, Condition, Process)
    	- Message = The name of the condition (what is printed and logged)
    	- Condition = Callable taking codes and returning a boolean Series: True if Passed, and False if Failed
    	- Process = Callable taking codes; aims to resolve all issues that stop condition from passing (Do not change index!)
    	"""
    	checks = [
    		(
    			"Not Empty",
    			lambda codes : pd.Series([len(codes) > 0]), #Should be true if passed
    			lambda codes : raise_(Exception("Code List is Empty")) #corrects code, or logs, or throws error
    		)
    	]
    	
    	def __init__(self, file_path=None):
    		"""
    		file_path: stored on the instance (default None); the subclasses in this
    		file forward it to log_invalid_code
    		"""
    		self.file_path = file_path
    	
    	def process(self, codes):
    		"""
    		identify issues that do not pass and fix them with defined process

    		codes: pandas Series of codes to check
    		returns: codes after the process of every failing check has been applied
    		raises: Exception if a check still fails after its process has run
    		"""
    		# codes = codes.dropna()
    		for msg, cond, process in self.checks: #run each check
    			passed = cond(codes) #evaluate once and reuse for the test and the failure count
    			if passed.all():
    				print("Check:", msg, bcolors.OKGREEN+"PASSED"+bcolors.ENDC)
    				continue
    			print("Check: ", msg, bcolors.FAIL+f"{(~passed).sum()} FAILED"+bcolors.ENDC,)
    			codes = process(codes) #run process to fix issue
    			if not cond(codes).all(): #not resolved by process
    				raise Exception(f"ERROR: Check {msg} is NOT resolved")
    			print("Check:", msg, "is resolved")
    		return codes
    	
    	def verify(self, codes): 
    		"""
    		verify all identified issues pass

    		codes: pandas Series of codes to check (not modified; no process is run)
    		returns: True if every check passes, otherwise False
    		"""
    		failed = 0 #integer count, so the summary prints "1" rather than "1.0"
    		for msg, cond, _ in self.checks: #run each check
    			out = cond(codes)
    			if out.all():
    				continue
    			failed += 1
    			print("Verify:", msg, bcolors.FAIL+"FAILED"+bcolors.ENDC)
    			# Only element-wise conditions can be used as a mask; "Not Empty"
    			# returns a single value that does not line up with codes.
    			if len(out) == len(codes):
    				print(codes[~out]) #show failed codes (out is True where passed)
    				
    		if failed == 0: #check all have passed
    			print(f"Verify: {bcolors.OKGREEN}ALL PASSED{bcolors.ENDC}")
    			return True
    		#not all have passed
    		print("Verify: ", bcolors.FAIL, failed, " FAILED", bcolors.ENDC)
    		return False
    	
    class Read2_code(Proto_code):
    	"""
    	Checks for Read V2 codes: codes are padded/truncated to 5 characters, then
    	codes with unexpected characters or missing from the lookup table are
    	handed to log_invalid_code.
    	"""
    	def __init__(self, file_path=None):
    		"""
    		file_path: forwarded to log_invalid_code when a check fails

    		Reads the lookup table from maps/processed/read2_code.parquet.
    		"""
    		super().__init__(file_path)
    		self.db = pd.read_parquet("maps/processed/read2_code.parquet")
    		# NOTE(review): arg_* look like command-line option metadata - confirm against caller
    		self.arg_small = "-r2"
    		self.arg_long = "--read2-code"
    		self.arg_help = "Read V2 Codes Column name in Source File"

    		# Raw string: "\." in a plain literal is an invalid escape sequence.
    		valid_pattern = r"^[a-zA-Z0-9\.]+$"

    		def is_alnum_dot(codes):
    			# True where the code consists only of letters, digits and dots
    			return codes.str.match(valid_pattern)

    		def is_known(codes):
    			# True where the code is present in the read2_code column of self.db
    			return in_database(codes, self.db, "read2_code")

    		def log_failures(codes, passed, cause):
    			# passed is True for valid rows; the result replaces codes in process()
    			return log_invalid_code(codes,
    									passed, #Log non-matching rows
    									code_type="read2_code",
    									file_path=self.file_path,
    									cause=cause)

    		self.checks = [
    			(
    				"Not Empty",
    				lambda codes : pd.Series([len(codes) > 0]),
    				lambda codes : raise_(Exception("Code List is Empty"))
    			),
    			(
    				"Too Short",
    				lambda codes : ~(codes.str.len() < 5),
    				lambda codes : codes.str.pad(width=5, side='right', fillchar='.') #right-pad with dots
    			),
    			(
    				"Too Long",
    				lambda codes : ~(codes.str.len() > 5),
    				lambda codes : codes.str[:5] #keep first 5 characters
    			),
    			(
    				"Alphanumeric Dot",
    				is_alnum_dot,
    				lambda codes : log_failures(codes, is_alnum_dot(codes), "QA Alphanumeric Dot"),
    			),
    			(
    				"In Database",
    				is_known,
    				lambda codes : log_failures(codes, is_known(codes), "QA In Database"),
    			),
    		]
    	
    class Read3_code(Proto_code):
    	"""
    	Checks for Read V3 codes: codes are padded/truncated to 5 characters, then
    	codes with unexpected characters or missing from the lookup table are
    	handed to log_invalid_code.
    	"""
    	def __init__(self, file_path=None):
    		"""
    		file_path: forwarded to log_invalid_code when a check fails

    		Reads the lookup table from maps/processed/read3_code.parquet.
    		"""
    		super().__init__(file_path)
    		# NOTE(review): arg_* look like command-line option metadata - confirm against caller
    		self.arg_small = "-r3"
    		self.arg_long = "--read3-code"
    		self.arg_help = "Read V3 Codes Column name in Source File"
    		self.db = pd.read_parquet("maps/processed/read3_code.parquet")

    		# Raw string: "\." in a plain literal is an invalid escape sequence.
    		valid_pattern = r"^[a-zA-Z0-9\.]+$"

    		def is_alnum_dot(codes):
    			# True where the code consists only of letters, digits and dots
    			return codes.str.match(valid_pattern)

    		def is_known(codes):
    			# True where the code is present in the read3_code column of self.db
    			return in_database(codes, self.db, "read3_code")

    		def log_failures(codes, passed, cause):
    			# passed is True for valid rows; the result replaces codes in process()
    			return log_invalid_code(codes,
    									passed, #Log non-matching rows
    									code_type="read3_code",
    									file_path=self.file_path,
    									cause=cause)

    		self.checks = [
    			(
    				"Not Empty",
    				lambda codes : pd.Series([len(codes) > 0]),
    				lambda codes : raise_(Exception("Code List is Empty"))
    			),
    			(
    				"Too Short",
    				lambda codes : ~(codes.str.len() < 5),
    				lambda codes : codes.str.pad(width=5, side='right', fillchar='.') #right-pad with dots
    			),
    			(
    				"Too Long",
    				lambda codes : ~(codes.str.len() > 5),
    				lambda codes : codes.str[:5] #keep first 5 characters
    			),
    			(
    				"Alphanumeric Dot",
    				is_alnum_dot,
    				lambda codes : log_failures(codes, is_alnum_dot(codes), "QA Alphanumeric Dot"),
    			),
    			(
    				"In Database",
    				is_known,
    				lambda codes : log_failures(codes, is_known(codes), "QA In Database"),
    			),
    		]
    	
    class Icd10_code(Proto_code):
    	"""
    	Checks for ICD10 codes: dots are stripped, and codes that are too short,
    	not upper-case alphanumeric, or missing from both lookup columns are
    	handed to log_invalid_code.
    	"""
    	def __init__(self, file_path=None):
    		"""
    		file_path: forwarded to log_invalid_code when a check fails

    		Reads the lookup table from maps/processed/icd10_code.parquet.
    		"""
    		super().__init__(file_path)
    		# NOTE(review): arg_* look like command-line option metadata - confirm against caller
    		self.arg_small = "-i"
    		self.arg_long = "--icd10-code"
    		self.arg_help = "ICD10 Codes Column name in Source File"
    		self.db = pd.read_parquet("maps/processed/icd10_code.parquet")

    		def long_enough(codes):
    			# True where the code has at least 3 characters
    			return ~(codes.str.len() < 3)

    		def is_alnum_capital(codes):
    			# True where the code consists only of upper-case letters and digits
    			return codes.str.match("^[A-Z0-9]+$")

    		def is_known(codes):
    			# A code passes if it is in either the main or the alternative column
    			return (in_database(codes, self.db, "icd10_code")
    					| in_database(codes, self.db, "icd10_alt_code"))

    		def log_failures(codes, passed, cause):
    			# passed is True for valid rows; the result replaces codes in process()
    			return log_invalid_code(codes,
    									passed, #Log non-matching rows
    									code_type="icd10_code",
    									file_path=self.file_path,
    									cause=cause)

    		self.checks = [
    			(
    				"Not Empty",
    				lambda codes : pd.Series([len(codes) > 0]),
    				lambda codes : raise_(Exception("Code List is Empty"))
    			),
    			(
    				"Too Short",
    				long_enough,
    				lambda codes : log_failures(codes, long_enough(codes), "QA Too Short"),
    			),
    			(
    				"Has Dot",
    				lambda codes : ~(codes.str.contains(r'\.')), #check if contains dot
    				# regex=False: the dot must be a literal; as a regex it would match every character
    				lambda codes : codes.str.replace(".", "", regex=False) #delete any dots in string
    				# lambda codes : codes.str.split('\.').apply(lambda ls: ls[0]) #only get part before dot
    			),
    			(
    				"Alphanumeric Capital",
    				is_alnum_capital,
    				lambda codes : log_failures(codes, is_alnum_capital(codes), "QA Alphanumeric Capital"),
    			),
    			(
    				"In Database",
    				is_known,
    				lambda codes : log_failures(codes, is_known(codes), "QA In Database"),
    			)
    			# Disabled check, kept for reference:
    			# (
    			# 	"ICD10 Regex",
    			# 	lambda codes : codes.str.match("[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Alpha, Num, Num , Dot?, 4xAlphNum*
    			# 	lambda codes : log_invalid_code(codes,
    			# 									codes.str.match("[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Log non-matching rows
    			# 									code_type="icd10_code",
    			# 									file_path=self.file_path),
    			# )
    		]
    		
    	@staticmethod
    	def trim_icd10(codes):
    		"""
    		Return codes truncated to their first 4 characters.

    		Declared static because it takes no self; it can be called on the class
    		or on an instance.
    		"""
    		codes = codes.str[:4]
    		return codes
    		
    	
    class Snomed_code(Proto_code):
    	"""
    	Checks for SNOMED codes: length between 6 and 18 characters, digits only,
    	and present in the snomed_code lookup column. Failing rows are handed to
    	log_invalid_code.
    	"""
    	def __init__(self, file_path=None):
    		"""
    		file_path: forwarded to log_invalid_code when a check fails

    		Reads the lookup table from maps/processed/snomed_code.parquet.
    		"""
    		super().__init__(file_path)
    		self.arg_small = "-s"
    		self.arg_long = "--snomed-code"
    		self.arg_help = "SNOMED Codes Column name in Source File"
    		self.db = pd.read_parquet("maps/processed/snomed_code.parquet")

    		def long_enough(codes):
    			return ~(codes.str.len() < 6)

    		def short_enough(codes):
    			return ~(codes.str.len() > 18)

    		def digits_only(codes):
    			return codes.str.match("[0-9]+$")

    		def is_known(codes):
    			return in_database(codes, self.db, "snomed_code")

    		def logs(cond, cause):
    			# Build the process step: log the rows where cond is False
    			def run(codes):
    				return log_invalid_code(codes, #Log non-matching rows
    										cond(codes),
    										code_type="snomed_code",
    										file_path=self.file_path,
    										cause=cause)
    			return run

    		# Disabled checks, kept for reference:
    		# (
    		# 	"Not Empty",
    		# 	lambda codes : pd.Series([len(codes) > 0]),
    		# 	lambda codes : raise_(Exception("Code List is Empty"))
    		# ),
    		# (
    		# 	"Is Integer",
    		# 	lambda codes : codes.dtype == int,
    		# 	lambda codes : codes.astype(int) #Convert to integer
    		# ),
    		self.checks = [
    			("Too Short", long_enough, logs(long_enough, "QA Too Short")),
    			("Too Long", short_enough, logs(short_enough, "QA Too Long")),
    			("Numeric", digits_only, logs(digits_only, "QA Numeric")),
    			("In Database", is_known, logs(is_known, "QA In Database")),
    		]
    
    class Opcs4_code(Proto_code):
    	"""
    	Checks for OPCS4 codes: the list must not be empty, and codes missing from
    	the opcs4_code lookup column are handed to log_invalid_code.
    	"""
    	def __init__(self, file_path=None):
    		"""
    		file_path: forwarded to log_invalid_code when a check fails

    		Reads the lookup table from maps/processed/opcs4_code.parquet.
    		"""
    		super().__init__(file_path)
    		self.arg_small = "-o"
    		self.arg_long = "--opcs4-code"
    		self.arg_help = "OPCS4 Codes Column name in Source File"
    		self.db = pd.read_parquet("maps/processed/opcs4_code.parquet")

    		def has_codes(codes):
    			return pd.Series([len(codes) > 0])

    		def reject_empty(codes):
    			return raise_(Exception("Code List is Empty"))

    		def is_known(codes):
    			return in_database(codes, self.db, "opcs4_code")

    		def log_unknown(codes):
    			found = is_known(codes) #True where the code is in the lookup table
    			return log_invalid_code(codes, #Log non-matching rows
    									found,
    									code_type="opcs4_code",
    									file_path=self.file_path,
    									cause="QA In Database")

    		self.checks = [
    			("Not Empty", has_codes, reject_empty),
    			("In Database", is_known, log_unknown),
    		]
    
    class Atc_code(Proto_code):
    	"""
    	Checks for ATC codes: the list must not be empty, and codes that are not
    	upper-case alphanumeric are handed to log_invalid_code.
    	"""
    	def __init__(self, file_path=None):
    		"""
    		file_path: forwarded to log_invalid_code when a check fails
    		"""
    		super().__init__(file_path)
    		self.arg_small = "-a"
    		self.arg_long = "--atc-code"
    		self.arg_help = "ATC Codes Column name in Source File"

    		def has_codes(codes):
    			return pd.Series([len(codes) > 0])

    		def reject_empty(codes):
    			return raise_(Exception("Code List is Empty"))

    		def is_alnum_capital(codes):
    			return codes.str.match("^[A-Z0-9]+$")

    		def log_bad_chars(codes):
    			valid = is_alnum_capital(codes) #True where only A-Z and 0-9 are used
    			return log_invalid_code(codes, #Log non-matching rows
    									valid,
    									code_type="atc_code",
    									file_path=self.file_path,
    									cause="QA Alphanumeric Capital")

    		self.checks = [
    			("Not Empty", has_codes, reject_empty),
    			("Alphanumeric Capital", is_alnum_capital, log_bad_chars),
    		]
    		
    class Med_code(Proto_code):
    	"""
    	Checks for Med codes: only requires that the code list is not empty.
    	"""
    	def __init__(self, file_path=None):
    		"""
    		file_path: passed through to Proto_code.__init__
    		"""
    		super().__init__(file_path)
    		self.arg_small = "-m"
    		self.arg_long = "--med-code"
    		self.arg_help = "Med Codes Column name in Source File"

    		def has_codes(codes):
    			# Single-element Series: True when at least one code is present
    			return pd.Series([len(codes) > 0])

    		def reject_empty(codes):
    			return raise_(Exception("Code List is Empty"))

    		self.checks = [("Not Empty", has_codes, reject_empty)]
    		
    class Cprd_code(Proto_code):
    	"""
    	Checks for CPRD product codes: only requires that the code list is not empty.
    	"""
    	def __init__(self, file_path=None):
    		"""
    		file_path: passed through to Proto_code.__init__
    		"""
    		super().__init__(file_path)
    		self.arg_small = "-c"
    		self.arg_long = "--cprd-code"
    		self.arg_help = "CPRD Product Codes Column name in Source File"

    		def has_codes(codes):
    			# Single-element Series: True when at least one code is present
    			return pd.Series([len(codes) > 0])

    		def reject_empty(codes):
    			return raise_(Exception("Code List is Empty"))

    		self.checks = [("Not Empty", has_codes, reject_empty)]
    		
    # Maps each code-type name to the class that defines its checks.
    code_types = dict(
    	read2_code=Read2_code,
    	read3_code=Read3_code,
    	icd10_code=Icd10_code,
    	snomed_code=Snomed_code,
    	opcs4_code=Opcs4_code,
    	atc_code=Atc_code,
    	med_code=Med_code,
    	cprd_code=Cprd_code,
    )