Skip to content
Snippets Groups Projects
Select Git revision
  • 945ff7caa9c45bbe69b1478ec02cd25cc370e8b3
  • dev default
  • 64-feat-blacklist-unwanted-concepts-from-output
  • 61-feature-add-optional-backwards-mapping-for-consistency-with-older-version-2
  • main protected
  • 11-test-fix-tests-to-handle-licensed-data-resources-from-trud-snd-omop
  • general
  • pypi
  • old-main
  • v0.0.3
10 results

parse.py

Blame
  • parse.py 11.28 KiB
    import pandas as pd
    import numpy as np
    import os
    import trud
    
    from base import log_invalid_code
    from base import bcolors
    from base import raise_
    		
    def in_database(codes, db, col):
        """
        Return a boolean mask telling which entries of ``codes`` occur in ``db[col]``.

        codes: object exposing ``isin`` (the callers in this file pass a pandas Series)
        db: table indexable by column name
        col: name of the column of ``db`` holding the known codes
        """
        known_values = db[col]
        membership = codes.isin(known_values)
        return membership
    
    class Proto_code():
        """
        Base class for running quality checks over a list of codes.

        Define checks as list of 3 tuple: (Message, Condition, Process)
        - Message = The name of the condition (what is printed and logged)
        - Condition = callable(codes) returning a boolean Series: True if Passed, and False if Failed
        - Process = callable(codes) that aims to resolve all issues that stop condition from passing (Do not change index!)

        Subclasses replace ``self.checks`` with their own list in ``__init__``.
        """
        checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),  # should be True if passed
                lambda codes: raise_(Exception("Code List is Empty")),  # corrects code, or logs, or throws error
            )
        ]

        def __init__(self, file_path=None):
            """Remember ``file_path`` so that check steps can refer to the source of the codes."""
            self.file_path = file_path

        def process(self, codes):
            """
            Run every check in order and apply its Process step when the Condition fails.

            codes: the code list to check (the checks in this file call pandas ``.str`` / ``len`` on it)
            Returns the codes after all Process steps have been applied.
            Raises Exception when a Condition still fails after its Process step ran
            (a Process step may also raise on its own).
            """
            for msg, cond, process in self.checks:  # run each check
                passed = cond(codes)  # evaluate once; reused for the test and the failure count
                if passed.all():
                    print("Check:", msg, bcolors.OKGREEN+"PASSED"+bcolors.ENDC)
                    continue

                print("Check: ", msg, bcolors.FAIL+f"{(~passed).sum()} FAILED"+bcolors.ENDC,)
                codes = process(codes)  # run process to fix issue
                if cond(codes).all():  # re-evaluate on the processed codes
                    print("Check:", msg, "is resolved")
                else:  # not resolved by process
                    raise Exception(f"ERROR: Check {msg} is NOT resolved")
            return codes

        def verify(self, codes):
            """
            Re-run every Condition without applying any Process step.

            Prints each failing check together with the codes that failed it.
            Returns True when every check passes, otherwise False.
            """
            failed_checks = 0  # plain int so the summary prints "1" rather than "1.0"
            for msg, cond, _ in self.checks:  # run each check
                passed = cond(codes)
                if passed.all():
                    continue

                failed_checks += 1
                print("Verify:", msg, bcolors.FAIL+"FAILED"+bcolors.ENDC)
                # Only a row-wise mask can select rows; a list-level condition such as
                # "Not Empty" yields a single value and is skipped here.
                if len(passed) == len(codes):
                    print(codes[~passed])  # show failed codes (mask is True where passed)

            if failed_checks == 0:  # check all have passed
                print(f"Verify: {bcolors.OKGREEN}ALL PASSED{bcolors.ENDC}")
                return True

            # not all have passed
            print("Verify: ", bcolors.FAIL, failed_checks, " FAILED", bcolors.ENDC)
            return False
    	
    class Read2_code(Proto_code):
        """
        Checks for Read2 codes: not empty, exactly five characters (padded with '.'
        or truncated), only letters/digits/dots, and present in the Read2 lookup table.
        """

        def __init__(self, file_path=None):
            """
            Load the processed Read2 lookup table and define the checks.

            file_path: stored on the instance and passed to log_invalid_code by the logging steps
            Raises FileNotFoundError when the processed parquet file does not exist.
            """
            super().__init__(file_path)

            input_path = trud.MAPS_PROCESSED_DIR / 'read2_code.parquet'
            if not input_path.is_file():
                raise FileNotFoundError(f"Error: Read2 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
            self.db = pd.read_parquet(input_path)

            def valid_chars(codes):
                # Raw string: "\." in an ordinary literal is an invalid escape sequence
                return codes.str.match(r"^[a-zA-Z0-9\.]+$")

            def in_db(codes):
                return in_database(codes, self.db, "read2_code")

            def log_failures(cond, cause):
                # Build a Process step that hands the codes and their pass mask to log_invalid_code
                return lambda codes: log_invalid_code(
                    codes,
                    cond(codes),  # mask is True where the code passed
                    code_type="read2_code",
                    file_path=self.file_path,
                    cause=cause,
                )

            self.checks = [
                (
                    "Not Empty",
                    lambda codes: pd.Series([len(codes) > 0]),
                    lambda codes: raise_(Exception("Code List is Empty")),
                ),
                (
                    "Too Short",
                    lambda codes: ~(codes.str.len() < 5),
                    lambda codes: codes.str.pad(width=5, side='right', fillchar='.'),  # right-pad with dots
                ),
                (
                    "Too Long",
                    lambda codes: ~(codes.str.len() > 5),
                    lambda codes: codes.str[:5],  # keep the first five characters
                ),
                (
                    "Alphanumeric Dot",
                    valid_chars,
                    log_failures(valid_chars, "QA Alphanumeric Dot"),
                ),
                (
                    "In Database",
                    in_db,
                    log_failures(in_db, "QA In Database"),
                ),
            ]
    	
    class Read3_code(Proto_code):
        """
        Checks for Read3 codes: not empty, exactly five characters (padded with '.'
        or truncated), only letters/digits/dots, and present in the Read3 lookup table.
        """

        def __init__(self, file_path=None):
            """
            Load the processed Read3 lookup table and define the checks.

            file_path: stored on the instance and passed to log_invalid_code by the logging steps
            Raises FileNotFoundError when the processed parquet file does not exist.
            """
            super().__init__(file_path)

            input_path = trud.MAPS_PROCESSED_DIR / 'read3_code.parquet'
            if not input_path.is_file():
                raise FileNotFoundError(f"Error: Read3 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
            self.db = pd.read_parquet(input_path)

            def valid_chars(codes):
                # Raw string: "\." in an ordinary literal is an invalid escape sequence
                return codes.str.match(r"^[a-zA-Z0-9\.]+$")

            def in_db(codes):
                return in_database(codes, self.db, "read3_code")

            def log_failures(cond, cause):
                # Build a Process step that hands the codes and their pass mask to log_invalid_code
                return lambda codes: log_invalid_code(
                    codes,
                    cond(codes),  # mask is True where the code passed
                    code_type="read3_code",
                    file_path=self.file_path,
                    cause=cause,
                )

            self.checks = [
                (
                    "Not Empty",
                    lambda codes: pd.Series([len(codes) > 0]),
                    lambda codes: raise_(Exception("Code List is Empty")),
                ),
                (
                    "Too Short",
                    lambda codes: ~(codes.str.len() < 5),
                    lambda codes: codes.str.pad(width=5, side='right', fillchar='.'),  # right-pad with dots
                ),
                (
                    "Too Long",
                    lambda codes: ~(codes.str.len() > 5),
                    lambda codes: codes.str[:5],  # keep the first five characters
                ),
                (
                    "Alphanumeric Dot",
                    valid_chars,
                    log_failures(valid_chars, "QA Alphanumeric Dot"),
                ),
                (
                    "In Database",
                    in_db,
                    log_failures(in_db, "QA In Database"),
                ),
            ]
    	
    class Icd10_code(Proto_code):
        """
        Checks for ICD10 codes: not empty, at least three characters, no dots
        (dots are removed), only capital letters/digits, and present in either the
        "icd10_code" or the "icd10_alt_code" column of the ICD10 lookup table.
        """

        def __init__(self, file_path=None):
            """
            Load the processed ICD10 lookup table and define the checks.

            file_path: stored on the instance and passed to log_invalid_code by the logging steps
            Raises FileNotFoundError when the processed parquet file does not exist.
            """
            super().__init__(file_path)

            input_path = trud.MAPS_PROCESSED_DIR / 'icd10_code.parquet'
            if not input_path.is_file():
                raise FileNotFoundError(f"Error: ICD10 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
            self.db = pd.read_parquet(input_path)

            def long_enough(codes):
                return ~(codes.str.len() < 3)

            def valid_chars(codes):
                return codes.str.match("^[A-Z0-9]+$")

            def in_db(codes):
                # A code passes when it is found in either column
                return in_database(codes, self.db, "icd10_code") | in_database(codes, self.db, "icd10_alt_code")

            def log_failures(cond, cause):
                # Build a Process step that hands the codes and their pass mask to log_invalid_code
                return lambda codes: log_invalid_code(
                    codes,
                    cond(codes),  # mask is True where the code passed
                    code_type="icd10_code",
                    file_path=self.file_path,
                    cause=cause,
                )

            self.checks = [
                (
                    "Not Empty",
                    lambda codes: pd.Series([len(codes) > 0]),
                    lambda codes: raise_(Exception("Code List is Empty")),
                ),
                (
                    "Too Short",
                    long_enough,
                    log_failures(long_enough, "QA Too Short"),
                ),
                (
                    "Has Dot",
                    # regex=False: "." is matched literally, independent of the pandas default
                    lambda codes: ~(codes.str.contains(".", regex=False)),  # check if contains dot
                    lambda codes: codes.str.replace(".", "", regex=False),  # delete any dots in string
                    # lambda codes : codes.str.split('\.').apply(lambda ls: ls[0]) #only get part before dot
                ),
                (
                    "Alphanumeric Capital",
                    valid_chars,
                    log_failures(valid_chars, "QA Alphanumeric Capital"),
                ),
                (
                    "In Database",
                    in_db,
                    log_failures(in_db, "QA In Database"),
                ),
                # Disabled check, kept for reference:
                # (
                #     "ICD10 Regex",
                #     lambda codes : codes.str.match(r"[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Alpha, Num, Num , Dot?, 4xAlphNum*
                #     lambda codes : log_invalid_code(codes,
                #                                     codes.str.match(r"[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Log non-matching rows
                #                                     code_type="icd10_code",
                #                                     file_path=self.file_path),
                # )
            ]

        @staticmethod
        def trim_icd10(codes):
            """
            Return ``codes`` with every entry cut down to its first four characters.

            Declared static because it takes no ``self``; calling it through the class
            keeps working and calling it on an instance no longer passes the instance
            as ``codes``.
            """
            return codes.str[:4]
    		
    	
    class Snomed_code(Proto_code):
        """
        Checks for SNOMED codes: length between 6 and 18 characters, digits only,
        and present in the SNOMED lookup table. Failing rows are passed to log_invalid_code.
        """

        def __init__(self, file_path=None):
            """
            Load the processed SNOMED lookup table and define the checks.

            Raises FileNotFoundError when the processed parquet file does not exist.
            """
            super().__init__(file_path)

            input_path = trud.MAPS_PROCESSED_DIR / 'snomed_code.parquet'
            if not input_path.is_file():
                raise FileNotFoundError(f"Error: SNOMED code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
            self.db = pd.read_parquet(input_path)

            # Conditions are written as negations of the failure test on purpose:
            # this keeps the original treatment of missing values.
            def long_enough(codes):
                return ~(codes.str.len() < 6)

            def short_enough(codes):
                return ~(codes.str.len() > 18)

            def digits_only(codes):
                return codes.str.match("[0-9]+$")

            def in_db(codes):
                return in_database(codes, self.db, "snomed_code")

            def log_failures(cond, cause):
                # Process step: pass the codes and their pass mask to log_invalid_code
                return lambda codes: log_invalid_code(
                    codes,
                    cond(codes),
                    code_type="snomed_code",
                    file_path=self.file_path,
                    cause=cause,
                )

            # The "Not Empty" and "Is Integer" (codes.astype(int)) checks are
            # intentionally not part of this list.
            self.checks = [
                ("Too Short", long_enough, log_failures(long_enough, "QA Too Short")),
                ("Too Long", short_enough, log_failures(short_enough, "QA Too Long")),
                ("Numeric", digits_only, log_failures(digits_only, "QA Numeric")),
                ("In Database", in_db, log_failures(in_db, "QA In Database")),
            ]
    
    class Opcs4_code(Proto_code):
        """
        Checks for OPCS4 codes: the list is not empty and every code is present
        in the OPCS4 lookup table.
        """

        def __init__(self, file_path=None):
            """
            Load the processed OPCS4 lookup table and define the checks.

            Raises FileNotFoundError when the processed parquet file does not exist.
            """
            super().__init__(file_path)

            input_path = trud.MAPS_PROCESSED_DIR / 'opcs4_code.parquet'
            if not input_path.is_file():
                raise FileNotFoundError(f"Error: OPCS4 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
            self.db = pd.read_parquet(input_path)

            def not_empty(codes):
                return pd.Series([len(codes) > 0])

            def fail_empty(codes):
                return raise_(Exception("Code List is Empty"))

            def in_db(codes):
                return in_database(codes, self.db, "opcs4_code")

            def log_missing(codes):
                # Pass the codes and their pass mask to log_invalid_code
                return log_invalid_code(
                    codes,
                    in_db(codes),
                    code_type="opcs4_code",
                    file_path=self.file_path,
                    cause="QA In Database",
                )

            self.checks = [
                ("Not Empty", not_empty, fail_empty),
                ("In Database", in_db, log_missing),
            ]
    
    class Atc_code(Proto_code):
        """
        Checks for ATC codes: the list is not empty and every code consists only
        of capital letters and digits. No lookup table is loaded for this type.
        """

        def __init__(self, file_path=None):
            """Store ``file_path`` via the base class and define the checks."""
            super().__init__(file_path)

            def not_empty(codes):
                return pd.Series([len(codes) > 0])

            def fail_empty(codes):
                return raise_(Exception("Code List is Empty"))

            def capital_alnum(codes):
                return codes.str.match("^[A-Z0-9]+$")

            def log_bad_chars(codes):
                # Pass the codes and their pass mask to log_invalid_code
                return log_invalid_code(
                    codes,
                    capital_alnum(codes),
                    code_type="atc_code",
                    file_path=self.file_path,
                    cause="QA Alphanumeric Capital",
                )

            self.checks = [
                ("Not Empty", not_empty, fail_empty),
                ("Alphanumeric Capital", capital_alnum, log_bad_chars),
            ]
    		
    class Med_code(Proto_code):
        """Checks for "med" codes: only requires that the code list is not empty."""

        def __init__(self, file_path=None):
            """Store ``file_path`` via the base class and define the single check."""
            super().__init__(file_path)

            def not_empty(codes):
                return pd.Series([len(codes) > 0])

            def fail_empty(codes):
                return raise_(Exception("Code List is Empty"))

            self.checks = [("Not Empty", not_empty, fail_empty)]
    		
    class Cprd_code(Proto_code):
        """Checks for "cprd" codes: only requires that the code list is not empty."""

        def __init__(self, file_path=None):
            """Store ``file_path`` via the base class and define the single check."""
            super().__init__(file_path)

            def not_empty(codes):
                return pd.Series([len(codes) > 0])

            def fail_empty(codes):
                return raise_(Exception("Code List is Empty"))

            self.checks = [("Not Empty", not_empty, fail_empty)]
    		
    # Registry mapping each short code-type key to the checker class defined above.
    # The values are the classes themselves, not instances; constructing one runs
    # its __init__ (which, for the table-backed types, reads a parquet file).
    code_types = {
    	"read2": Read2_code,
    	"read3": Read3_code,
    	"icd10": Icd10_code,
    	"snomed": Snomed_code,
    	"opcs4": Opcs4_code,
    	"atc": Atc_code,
    	"med": Med_code,
    	"cprd": Cprd_code,
    }
    
    # Maps each "<type>_code" name to a vocabulary name string, or None where no
    # vocabulary is assigned (read3, med, cprd).
    # NOTE(review): the values look like OMOP vocabulary identifiers - confirm
    # against the code that consumes this mapping.
    vocab_types = {
    	"read2_code": "Read",
    	"read3_code": None,
    	"icd10_code": "ICD10CM",
    	"snomed_code": "SNOMED",
    	"opcs4_code": "OPCS4",
    	"atc_code": "ATC",
    	"med_code": None,
    	"cprd_code": None,
    }