Select Git revision
Controller.java
parse.py 10.89 KiB
import pandas as pd
import numpy as np
import os
from base import log_invalid_code
from base import bcolors
from base import raise_
def in_database(codes, db, col):
    """Return a boolean mask telling which entries of ``codes`` occur in ``db[col]``.

    ``codes`` must offer ``.isin`` (e.g. a pandas Series); ``db`` is indexed
    with ``col`` to obtain the collection of known values.
    """
    known_values = db[col]
    return codes.isin(known_values)
class Proto_code():
    """
    Base class for code checks.

    Define checks as list of 3 tuple: (Message, Condition, Process)
    - Message = The name of the condition (what is printed and logged)
    - Condition = callable returning a boolean Series: True if Passed, and False if Failed
    - Process = callable that aims to resolve all issues that stop the condition
      from passing (Do not change index!)
    """
    checks = [
        (
            "Not Empty",
            lambda codes: pd.Series([len(codes) > 0]),  # single-element mask, True if passed
            lambda codes: raise_(Exception("Code List is Empty")),  # corrects codes, or logs, or throws error
        )
    ]

    def __init__(self, file_path=None):
        """Store ``file_path`` on the instance for later use by the checks."""
        self.file_path = file_path

    def process(self, codes):
        """
        Identify checks that do not pass and fix them with their defined process.

        Runs every entry of ``self.checks`` in order. When a condition fails,
        the matching process is applied to ``codes`` and the condition is
        evaluated again.

        Returns the (possibly modified) codes.
        Raises Exception when a check still fails after its process has run.
        """
        # codes = codes.dropna()
        for msg, cond, fix in self.checks:  # run each check
            passed = cond(codes)  # evaluate once; reused for the test and the count
            if passed.all():
                print("Check:", msg, bcolors.OKGREEN+"PASSED"+bcolors.ENDC)
                continue
            print("Check: ", msg, bcolors.FAIL+f"{(~passed).sum()} FAILED"+bcolors.ENDC)
            codes = fix(codes)  # run process to fix issue
            if not cond(codes).all():  # not resolved by process
                raise Exception(f"ERROR: Check {msg} is NOT resolved")
            print("Check:", msg, "is resolved")
        return codes

    def verify(self, codes):
        """
        Verify that all checks pass, without modifying ``codes``.

        Prints every failing check and, when the check produced one mask entry
        per code, the codes that failed it.

        Returns True when every check passes, otherwise False.
        """
        failed_checks = 0
        for msg, cond, _fix in self.checks:  # run each check
            passed = cond(codes)
            if passed.all():
                continue
            failed_checks += 1
            print("Verify:", msg, bcolors.FAIL+"FAILED"+bcolors.ENDC)
            # Whole-list checks such as "Not Empty" return a one-element mask
            # that cannot be used to index codes; only row-wise masks can.
            if passed.index.equals(codes.index):
                print(codes[~passed])  # show failed codes
        if failed_checks == 0:  # check all have passed
            print(f"Verify: {bcolors.OKGREEN}ALL PASSED{bcolors.ENDC}")
            return True
        # not all have passed
        print("Verify: ", bcolors.FAIL, failed_checks, " FAILED", bcolors.ENDC)
        return False
class Read2_code(Proto_code):
    """Checks for Read V2 codes.

    Codes are padded/truncated to 5 characters, must consist of letters,
    digits and dots, and must appear in the ``read2_code`` column of the
    lookup table loaded from ``maps/processed/read2_code.parquet``.
    """

    def __init__(self, file_path=None):
        """Load the lookup table and define CLI argument names and checks.

        ``file_path`` is forwarded to ``log_invalid_code`` by the logging checks.
        """
        super().__init__(file_path)
        self.db = pd.read_parquet("maps/processed/read2_code.parquet")
        self.arg_small = "-r2"
        self.arg_long = "--read2-code"
        self.arg_help = "Read V2 Codes Column name in Source File"
        # NOTE(review): log_invalid_code is presumably expected to return the
        # codes with failing rows handled so the condition passes afterwards;
        # confirm against base.log_invalid_code.
        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes: raise_(Exception("Code List is Empty")),
            ),
            (
                "Too Short",
                lambda codes: ~(codes.str.len() < 5),
                # right-pad with dots up to 5 characters
                lambda codes: codes.str.pad(width=5, side='right', fillchar='.'),
            ),
            (
                "Too Long",
                lambda codes: ~(codes.str.len() > 5),
                # keep only the first 5 characters
                lambda codes: codes.str[:5],
            ),
            (
                "Alphanumeric Dot",
                # raw strings: "\." in a plain string literal is an invalid escape sequence
                lambda codes: codes.str.match(r"^[a-zA-Z0-9\.]+$"),
                lambda codes: log_invalid_code(
                    codes,
                    codes.str.match(r"^[a-zA-Z0-9\.]+$"),  # Log non-matching rows
                    code_type="read2_code",
                    file_path=self.file_path,
                    cause="QA Alphanumeric Dot",
                ),
            ),
            (
                "In Database",
                lambda codes: in_database(codes, self.db, "read2_code"),
                lambda codes: log_invalid_code(
                    codes,
                    in_database(codes, self.db, "read2_code"),  # Log non-matching rows
                    code_type="read2_code",
                    file_path=self.file_path,
                    cause="QA In Database",
                ),
            ),
        ]
class Read3_code(Proto_code):
    """Checks for Read V3 codes.

    Codes are padded/truncated to 5 characters, must consist of letters,
    digits and dots, and must appear in the ``read3_code`` column of the
    lookup table loaded from ``maps/processed/read3_code.parquet``.
    """

    def __init__(self, file_path=None):
        """Define CLI argument names, load the lookup table and define checks.

        ``file_path`` is forwarded to ``log_invalid_code`` by the logging checks.
        """
        super().__init__(file_path)
        self.arg_small = "-r3"
        self.arg_long = "--read3-code"
        self.arg_help = "Read V3 Codes Column name in Source File"
        self.db = pd.read_parquet("maps/processed/read3_code.parquet")
        # NOTE(review): log_invalid_code is presumably expected to return the
        # codes with failing rows handled so the condition passes afterwards;
        # confirm against base.log_invalid_code.
        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes: raise_(Exception("Code List is Empty")),
            ),
            (
                "Too Short",
                lambda codes: ~(codes.str.len() < 5),
                # right-pad with dots up to 5 characters
                lambda codes: codes.str.pad(width=5, side='right', fillchar='.'),
            ),
            (
                "Too Long",
                lambda codes: ~(codes.str.len() > 5),
                # keep only the first 5 characters
                lambda codes: codes.str[:5],
            ),
            (
                "Alphanumeric Dot",
                # raw strings: "\." in a plain string literal is an invalid escape sequence
                lambda codes: codes.str.match(r"^[a-zA-Z0-9\.]+$"),
                lambda codes: log_invalid_code(
                    codes,
                    codes.str.match(r"^[a-zA-Z0-9\.]+$"),  # Log non-matching rows
                    code_type="read3_code",
                    file_path=self.file_path,
                    cause="QA Alphanumeric Dot",
                ),
            ),
            (
                "In Database",
                lambda codes: in_database(codes, self.db, "read3_code"),
                lambda codes: log_invalid_code(
                    codes,
                    in_database(codes, self.db, "read3_code"),  # Log non-matching rows
                    code_type="read3_code",
                    file_path=self.file_path,
                    cause="QA In Database",
                ),
            ),
        ]
class Icd10_code(Proto_code):
    """Checks for ICD10 codes.

    Codes must be at least 3 characters long, have their dots removed,
    consist of capital letters and digits only, and appear in either the
    ``icd10_code`` or the ``icd10_alt_code`` column of the lookup table
    loaded from ``maps/processed/icd10_code.parquet``.
    """

    def __init__(self, file_path=None):
        """Define CLI argument names, load the lookup table and define checks.

        ``file_path`` is forwarded to ``log_invalid_code`` by the logging checks.
        """
        super().__init__(file_path)
        self.arg_small = "-i"
        self.arg_long = "--icd10-code"
        self.arg_help = "ICD10 Codes Column name in Source File"
        self.db = pd.read_parquet("maps/processed/icd10_code.parquet")
        # NOTE(review): log_invalid_code is presumably expected to return the
        # codes with failing rows handled so the condition passes afterwards;
        # confirm against base.log_invalid_code.
        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes: raise_(Exception("Code List is Empty")),
            ),
            (
                "Too Short",
                lambda codes: ~(codes.str.len() < 3),
                lambda codes: log_invalid_code(
                    codes,
                    ~(codes.str.len() < 3),  # Log non-matching rows
                    code_type="icd10_code",
                    file_path=self.file_path,
                    cause="QA Too Short",
                ),
            ),
            (
                "Has Dot",
                # check if contains dot (raw string avoids the invalid "\." escape)
                lambda codes: ~(codes.str.contains(r'\.')),
                # delete any dots in string; regex=False makes "." a literal
                # dot regardless of the pandas version's default
                lambda codes: codes.str.replace(".", "", regex=False),
                # lambda codes : codes.str.split('\.').apply(lambda ls: ls[0]) #only get part before dot
            ),
            (
                "Alphanumeric Capital",
                lambda codes: codes.str.match("^[A-Z0-9]+$"),
                lambda codes: log_invalid_code(
                    codes,
                    codes.str.match("^[A-Z0-9]+$"),  # Log non-matching rows
                    code_type="icd10_code",
                    file_path=self.file_path,
                    cause="QA Alphanumeric Capital",
                ),
            ),
            (
                "In Database",
                # passes when the code is found in either column
                lambda codes: (
                    in_database(codes, self.db, "icd10_code")
                    | in_database(codes, self.db, "icd10_alt_code")
                ),
                lambda codes: log_invalid_code(
                    codes,  # Log non-matching rows
                    in_database(codes, self.db, "icd10_code")
                    | in_database(codes, self.db, "icd10_alt_code"),
                    code_type="icd10_code",
                    file_path=self.file_path,
                    cause="QA In Database",
                ),
            )
            # (
            # "ICD10 Regex",
            # lambda codes : codes.str.match("[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Alpha, Num, Num , Dot?, 4xAlphNum*
            # lambda codes : log_invalid_code(codes,
            # codes.str.match("[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Log non-matching rows
            # code_type="icd10_code",
            # file_path=self.file_path),
            # )
        ]
def trim_icd10(codes):
    """Return ``codes`` with every string cut down to its first 4 characters."""
    return codes.str[:4]
class Snomed_code(Proto_code):
    """Checks for SNOMED codes: length between 6 and 18, digits only, and
    present in the ``snomed_code`` column of the parquet lookup table."""

    def __init__(self, file_path=None):
        """Set CLI argument names, read the lookup table and build the check list."""
        super().__init__(file_path)
        self.arg_small = "-s"
        self.arg_long = "--snomed-code"
        self.arg_help = "SNOMED Codes Column name in Source File"
        self.db = pd.read_parquet("maps/processed/snomed_code.parquet")

        def not_too_short(codes):
            return ~(codes.str.len() < 6)

        def not_too_long(codes):
            return ~(codes.str.len() > 18)

        def only_digits(codes):
            return codes.str.match("[0-9]+$")

        def known_code(codes):
            return in_database(codes, self.db, "snomed_code")

        def log_failures(condition, cause):
            """Build the process step that hands rows failing ``condition`` to log_invalid_code."""
            def fix(codes):
                passed = condition(codes)
                return log_invalid_code(codes,
                                        passed,
                                        code_type="snomed_code",
                                        file_path=self.file_path,
                                        cause=cause)
            return fix

        self.checks = [
            # (
            # "Not Empty",
            # lambda codes : pd.Series([len(codes) > 0]),
            # lambda codes : raise_(Exception("Code List is Empty"))
            # ),
            ("Too Short", not_too_short, log_failures(not_too_short, "QA Too Short")),
            ("Too Long", not_too_long, log_failures(not_too_long, "QA Too Long")),
            ("Numeric", only_digits, log_failures(only_digits, "QA Numeric")),
            # (
            # "Is Integer",
            # lambda codes : codes.dtype == int,
            # lambda codes : codes.astype(int) #Convert to integer
            # ),
            ("In Database", known_code, log_failures(known_code, "QA In Database")),
        ]
class Opcs4_code(Proto_code):
    """Checks for OPCS4 codes: the list must not be empty and every code must
    be present in the ``opcs4_code`` column of the parquet lookup table."""

    def __init__(self, file_path=None):
        """Set CLI argument names, read the lookup table and build the check list."""
        super().__init__(file_path)
        self.arg_small = "-o"
        self.arg_long = "--opcs4-code"
        self.arg_help = "OPCS4 Codes Column name in Source File"
        self.db = pd.read_parquet("maps/processed/opcs4_code.parquet")

        def has_entries(codes):
            return pd.Series([len(codes) > 0])

        def reject_empty(codes):
            return raise_(Exception("Code List is Empty"))

        def known_code(codes):
            return in_database(codes, self.db, "opcs4_code")

        def log_unknown(codes):
            found = known_code(codes)  # rows where this is False get logged
            return log_invalid_code(codes,
                                    found,
                                    code_type="opcs4_code",
                                    file_path=self.file_path,
                                    cause="QA In Database")

        self.checks = [
            ("Not Empty", has_entries, reject_empty),
            ("In Database", known_code, log_unknown),
        ]
class Atc_code(Proto_code):
    """Checks for ATC codes: the list must not be empty and every code must
    consist of capital letters and digits only."""

    def __init__(self, file_path=None):
        """Set CLI argument names and build the check list."""
        super().__init__(file_path)
        self.arg_small = "-a"
        self.arg_long = "--atc-code"
        self.arg_help = "ATC Codes Column name in Source File"

        def has_entries(codes):
            return pd.Series([len(codes) > 0])

        def reject_empty(codes):
            return raise_(Exception("Code List is Empty"))

        def capital_alphanumeric(codes):
            return codes.str.match("^[A-Z0-9]+$")

        def log_malformed(codes):
            well_formed = capital_alphanumeric(codes)  # rows where this is False get logged
            return log_invalid_code(codes,
                                    well_formed,
                                    code_type="atc_code",
                                    file_path=self.file_path,
                                    cause="QA Alphanumeric Capital")

        self.checks = [
            ("Not Empty", has_entries, reject_empty),
            ("Alphanumeric Capital", capital_alphanumeric, log_malformed),
        ]
class Med_code(Proto_code):
    """Checks for Med codes: only requires that the code list is not empty."""

    def __init__(self, file_path=None):
        """Set CLI argument names and build the check list."""
        super().__init__(file_path)
        self.arg_small = "-m"
        self.arg_long = "--med-code"
        self.arg_help = "Med Codes Column name in Source File"

        def has_entries(codes):
            return pd.Series([len(codes) > 0])

        def reject_empty(codes):
            return raise_(Exception("Code List is Empty"))

        self.checks = [("Not Empty", has_entries, reject_empty)]
class Cprd_code(Proto_code):
    """Checks for CPRD product codes: only requires that the code list is not empty."""

    def __init__(self, file_path=None):
        """Set CLI argument names and build the check list."""
        super().__init__(file_path)
        self.arg_small = "-c"
        self.arg_long = "--cprd-code"
        self.arg_help = "CPRD Product Codes Column name in Source File"

        def has_entries(codes):
            return pd.Series([len(codes) > 0])

        def reject_empty(codes):
            return raise_(Exception("Code List is Empty"))

        self.checks = [("Not Empty", has_entries, reject_empty)]
# Maps each code-type name to the class implementing its checks.
code_types = dict(
    read2_code=Read2_code,
    read3_code=Read3_code,
    icd10_code=Icd10_code,
    snomed_code=Snomed_code,
    opcs4_code=Opcs4_code,
    atc_code=Atc_code,
    med_code=Med_code,
    cprd_code=Cprd_code,
)