acmc.parse
"""Code list parsers: one class per code type, each defining validation checks and fixes."""

import pandas as pd
import numpy as np
import os
from typing import Callable, Optional, Tuple
from pathlib import Path

from acmc import trud, logging_config as lc

# setup logging
logger = lc.setup_logger()

# Define allowed values
SUPPORTED_CODE_TYPES = {"read2", "read3", "icd10", "snomed", "opcs4", "atc"}


class CodesError:
    """Details of a failed code parser check, carried by InvalidCodesException."""

    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
        # Explicit assignment instead of iterating over locals(): the loop
        # itself created new locals while the mapping was being iterated.
        self.message = message
        self.codes = codes
        self.codes_file = codes_file
        self.mask = mask
        self.code_type = code_type


class InvalidCodesException(Exception):
    """Raised when invalid codes are found that cannot be resolved by processing."""

    def __init__(self, error):
        super().__init__(error.message)
        self.error = error


class Proto:
    """
    Base class for code type parsers.

    Define checks as list of 3-tuples: (Message, Condition, Process)
    - Message = The name of the condition (what is printed and logged)
    - Condition = True if Passed, and False if Failed
    - Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
    """

    checks: list[
        tuple[
            str,  # The description, e.g., "Not Empty"
            Callable[
                [list],
                pd.Series,
            ],  # The condition: takes the codes and returns a pd.Series of booleans
            Callable[
                [list, Path],
                None,
            ],  # The process: takes the codes and the codes file path
        ]
    ]

    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
        """Store the parser name and, when a path is given, load the code table.

        Raises:
            FileNotFoundError: if ``trud_codes_path`` is given but is not a file.
        """
        self.name: str = name
        if trud_codes_path is None:
            return
        if not trud_codes_path.is_file():
            # The message names the actual code type instead of always "Read2".
            raise FileNotFoundError(
                f"Error: {name} code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
            )
        self.trud_codes_path: Path = trud_codes_path
        self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)

    def raise_exception(self, ex: Exception):
        """Raise ``ex``; lets a lambda raise, since ``raise`` is a statement and lambdas only hold expressions."""
        raise ex

    def in_database(
        self, codes: pd.DataFrame, db: pd.DataFrame, col: str
    ) -> pd.DataFrame:
        """Return, element-wise, whether each code appears in column ``col`` of ``db``."""
        return codes.isin(db[col])

    def _reject(self, message: str, passing=None):
        """Build a process callable for a check that cannot be repaired.

        The returned callable takes ``(codes, codes_file)`` and always raises
        InvalidCodesException. ``passing``, when given, is the check's condition;
        it is re-evaluated on the codes to record the pass/fail mask on the error.
        """

        def fail(codes, codes_file):
            mask = None if passing is None else passing(codes)
            self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        return fail

    def _not_empty(self):
        """Return the standard "Not Empty" check, which cannot be repaired."""
        return (
            "Not Empty",
            lambda codes: pd.Series([len(codes) > 0]),
            self._reject("Code list is empty"),
        )

    def process(
        self, codes: pd.DataFrame, codes_file: Path
    ) -> Tuple[pd.DataFrame, list]:
        """Run every check in order and apply its process to codes that fail.

        Returns:
            The (possibly modified) codes and the list of CodesError objects
            collected from processes that raised InvalidCodesException.
        """
        errors = []
        for msg, cond, fix in self.checks:
            passed = cond(codes)  # evaluate the condition once per check
            if passed.all():
                logger.debug(f"Check: passed")
                continue

            logger.debug(f"Check: {msg} {(~passed).sum()} failed, trying to fix")
            try:
                codes = fix(codes, codes_file)
                logger.debug(f"Check: Fixed")
            except InvalidCodesException as ex:
                # not repairable: keep the codes unchanged and record the error
                errors.append(ex.error)

        return codes, errors

    def verify(self, codes: pd.DataFrame, codes_file: Path):
        """Return an array with one entry per check: whether all codes pass it."""
        conds = np.array([])
        for _, cond, _ in self.checks:
            conds = np.append(conds, cond(codes).all())
        return conds


class Read2(Proto):
    """Parser for "Read2" codes: loads the Read2 table, pads/truncates codes to
    5 characters and rejects codes that are malformed or not in the table."""

    def __init__(self):
        super().__init__("read2", trud.PROCESSED_PATH / "read2.parquet")

        def well_formed(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        self.checks = [
            self._not_empty(),
            (
                # codes shorter than 5 characters are right-padded with dots
                "Too Short",
                lambda codes: ~(codes.str.len() < 5),
                lambda codes, codes_file: codes.str.pad(
                    width=5, side="right", fillchar="."
                ),
            ),
            (
                # codes longer than 5 characters are truncated
                "Too Long",
                lambda codes: ~(codes.str.len() > 5),
                lambda codes, codes_file: codes.str[:5],
            ),
            (
                "Alphanumeric Dot",
                well_formed,
                self._reject("Illegal code format, not alphanumeric dot", well_formed),
            ),
            (
                "In Database",
                known,
                self._reject("Codes do not exist in database", known),
            ),
        ]


class Read3(Proto):
    """Parser for Read3 codes, with the same shape of checks as Read2."""

    def __init__(self):
        # Lower-case name like every other parser: it is used as the database
        # column and as the reported code type.
        # NOTE(review): confirm the column in read3.parquet is named "read3".
        super().__init__("read3", trud.PROCESSED_PATH / "read3.parquet")

        def well_formed(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        self.checks = [
            self._not_empty(),
            (
                "Too Short",
                lambda codes: ~(codes.str.len() < 5),
                lambda codes, codes_file: codes.str.pad(
                    width=5, side="right", fillchar="."
                ),
            ),
            (
                "Too Long",
                lambda codes: ~(codes.str.len() > 5),
                lambda codes, codes_file: codes.str[:5],
            ),
            # The mask is passed as ``mask``; CodesError has no ``check_regex``.
            (
                "Alphanumeric Dot",
                well_formed,
                self._reject("QA Alphanumeric Dot", well_formed),
            ),
            ("In Database", known, self._reject("QA In Database", known)),
        ]


class Icd10(Proto):
    """Parser for ICD10 codes: strips dots and validates against the ICD10 table."""

    def __init__(self):
        super().__init__("icd10", trud.PROCESSED_PATH / "icd10.parquet")

        def long_enough(codes):
            return ~(codes.str.len() < 3)

        def capital_alnum(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def known(codes):
            # known when present in either the primary or the "_alt" column
            return self.in_database(codes, self.db, self.name) | self.in_database(
                codes, self.db, self.name + "_alt"
            )

        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            f"Code list is empty {codes_file}",
                            codes=codes,
                            codes_file=codes_file,
                            mask=None,
                            code_type=self.name,
                        )
                    )
                ),
            ),
            ("Too Short", long_enough, self._reject("QA Too Short", long_enough)),
            (
                "Has Dot",
                lambda codes: ~(codes.str.match(r".*\..*")),
                # regex=False: "." must be removed literally, never treated
                # as the regex wildcard
                lambda codes, codes_file: codes.str.replace(".", "", regex=False),
            ),
            (
                "Alphanumeric Capital",
                capital_alnum,
                self._reject("QA Alphanumeric Capital", capital_alnum),
            ),
            ("In Database", known, self._reject("QA In Database", known)),
        ]

    @staticmethod
    def trim_icd10(codes: pd.DataFrame) -> pd.DataFrame:
        """Return the codes truncated to their first 4 characters."""
        return codes.str[:4]


class Snomed(Proto):
    """Parser for SNOMED codes; unlike the other parsers it has no "Not Empty" check."""

    def __init__(self):
        super().__init__("snomed", trud.PROCESSED_PATH / "snomed.parquet")

        def long_enough(codes):
            return ~(codes.str.len() < 6)

        def short_enough(codes):
            return ~(codes.str.len() > 18)

        def numeric(codes):
            return codes.str.match(r"[0-9]+$")

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        self.checks = [
            ("Too Short", long_enough, self._reject("QA Too Short", long_enough)),
            ("Too Long", short_enough, self._reject("QA Too Long", short_enough)),
            ("Numeric", numeric, self._reject("QA Numeric", numeric)),
            ("In Database", known, self._reject("QA In Database", known)),
        ]


class Opcs4(Proto):
    """Parser for OPCS4 codes: non-empty and present in the OPCS4 table."""

    def __init__(self):
        super().__init__("opcs4", trud.PROCESSED_PATH / "opcs4.parquet")

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        self.checks = [
            self._not_empty(),
            ("In Database", known, self._reject("QA In Database", known)),
        ]


class Atc(Proto):
    """Parser for ATC codes: non-empty and upper-case alphanumeric (no database)."""

    def __init__(self):
        super().__init__("atc", trud_codes_path=None)

        def capital_alnum(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        self.checks = [
            self._not_empty(),
            (
                "Alphanumeric Capital",
                capital_alnum,
                self._reject("QA Alphanumeric Capital", capital_alnum),
            ),
        ]


class Med(Proto):
    """Parser for "med" codes: only checks that the list is not empty."""

    def __init__(self):
        super().__init__("med", trud_codes_path=None)
        self.checks = [self._not_empty()]


class Cprd(Proto):
    """Parser for "cprd" codes: only checks that the list is not empty."""

    def __init__(self):
        super().__init__("cprd", trud_codes_path=None)
        self.checks = [self._not_empty()]


class CodeTypeParser:
    """Holds one parser instance per code type in ``code_types``, keyed by name."""

    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
        """Check the TRUD processed directory exists, then build all parsers.

        NOTE(review): the parsers built here read from trud.PROCESSED_PATH, not
        from ``trud_processed_dir``; the argument is only validated.

        Raises:
            FileNotFoundError: if ``trud_processed_dir`` is not an existing directory.
        """
        if not trud_processed_dir.exists() or not trud_processed_dir.is_dir():
            raise FileNotFoundError(
                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
            )

        self.code_types = {
            "read2": Read2(),
            "read3": Read3(),
            "icd10": Icd10(),
            "snomed": Snomed(),
            "opcs4": Opcs4(),
            "atc": Atc(),
            "med": Med(),
            "cprd": Cprd(),
        }
class CodesError:
    """Details of a failed code parser check, carried by InvalidCodesException.

    Attributes mirror the constructor arguments: ``message``, ``codes``,
    ``codes_file``, ``mask`` and ``code_type``.
    """

    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
        # Assign each attribute explicitly. The previous loop iterated over
        # locals() while binding new local names (key, value) inside that same
        # loop, which depends on interpreter-specific snapshot behaviour.
        self.message = message
        self.codes = codes
        self.codes_file = codes_file
        self.mask = mask
        self.code_type = code_type
A class used in InvalidCodesException to report an error if a code parser check fails
class InvalidCodesException(Exception):
    """Exception signalling invalid codes that processing could not resolve.

    The exception text is the ``message`` of the wrapped error object, which
    itself stays available as ``self.error``.
    """

    def __init__(self, error):
        text = error.message
        Exception.__init__(self, text)
        self.error = error
Custom exception class raised when invalid codes are found that cannot be resolved by processing
class Proto:
    """
    Base class for code type parsers.

    Define checks as list of 3-tuples: (Message, Condition, Process)
    - Message = The name of the condition (what is printed and logged)
    - Condition = True if Passed, and False if Failed
    - Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
    """

    checks: list[
        tuple[
            str,  # The description, e.g., "Not Empty"
            Callable[
                [list],
                pd.Series,
            ],  # The condition: takes the codes and returns a pd.Series of booleans
            Callable[
                [list, Path],
                None,
            ],  # The process: takes the codes and the codes file path
        ]
    ]

    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
        """Store the parser name and, when a path is given, load the code table.

        Args:
            name: code type name, kept as ``self.name``.
            trud_codes_path: parquet file to load into ``self.db``; when None
                no table is loaded and ``self.db`` is not set.

        Raises:
            FileNotFoundError: if ``trud_codes_path`` is given but is not a file.
        """
        self.name: str = name
        if trud_codes_path is None:
            return
        if not trud_codes_path.is_file():
            # The message names the actual code type instead of always "Read2".
            raise FileNotFoundError(
                f"Error: {name} code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
            )
        self.trud_codes_path: Path = trud_codes_path
        self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)

    def raise_exception(self, ex: Exception):
        """Raise ``ex``; lets a lambda raise, since ``raise`` is a statement and lambdas only hold expressions."""
        raise ex

    def in_database(
        self, codes: pd.DataFrame, db: pd.DataFrame, col: str
    ) -> pd.DataFrame:
        """Return, element-wise, whether each code appears in column ``col`` of ``db``."""
        return codes.isin(db[col])

    def process(
        self, codes: pd.DataFrame, codes_file: Path
    ) -> Tuple[pd.DataFrame, list]:
        """Run every check in order and apply its process to codes that fail.

        Returns:
            The (possibly modified) codes and the list of error objects
            collected from processes that raised InvalidCodesException.
        """
        errors = []
        for msg, cond, fix in self.checks:
            passed = cond(codes)  # evaluate the condition once per check
            if passed.all():
                logger.debug(f"Check: passed")
                continue

            logger.debug(f"Check: {msg} {(~passed).sum()} failed, trying to fix")
            try:
                codes = fix(codes, codes_file)
                logger.debug(f"Check: Fixed")
            except InvalidCodesException as ex:
                # not repairable: keep the codes unchanged and record the error
                errors.append(ex.error)

        return codes, errors

    def verify(self, codes: pd.DataFrame, codes_file: Path):
        """Return an array with one entry per check: whether all codes pass it."""
        conds = np.array([])
        for _, cond, _ in self.checks:
            conds = np.append(conds, cond(codes).all())
        return conds
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
    """Store the parser name and, when a path is given, load the code table.

    Args:
        name: code type name, kept as ``self.name``.
        trud_codes_path: parquet file to load into ``self.db``; when None no
            table is loaded and ``self.db`` is not set.

    Raises:
        FileNotFoundError: if ``trud_codes_path`` is given but is not a file.
    """
    self.name: str = name
    if trud_codes_path is None:
        return
    if not trud_codes_path.is_file():
        # The message names the actual code type instead of always "Read2".
        raise FileNotFoundError(
            f"Error: {name} code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
        )
    self.trud_codes_path: Path = trud_codes_path
    self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
def raise_exception(self, ex: Exception):
    """Raise the given exception.

    Exists so that a lambda can raise: ``raise`` is a statement and a lambda
    body may only be an expression, so the lambda calls this method instead.
    """
    raise ex
Raises an exception inside a lambda function. Python does not allow using a raise statement inside a lambda because a lambda can only contain expressions, not statements. The name raise_exception is used rather than raise_ because it is more explicit.
def process(
    self, codes: pd.DataFrame, codes_file: Path
) -> Tuple[pd.DataFrame, list]:
    """Run every check in order and apply its process to codes that fail.

    For each ``(msg, cond, fix)`` in ``self.checks`` the condition is evaluated
    once. If any code fails, ``fix(codes, codes_file)`` is called and its result
    replaces ``codes`` for the following checks.

    Returns:
        The (possibly modified) codes and the list of error objects collected
        from fixes that raised InvalidCodesException.
    """
    errors = []
    for msg, cond, fix in self.checks:
        passed = cond(codes)  # evaluate once; reused for the failure count
        if passed.all():
            logger.debug(f"Check: passed")
            continue

        logger.debug(f"Check: {msg} {(~passed).sum()} failed, trying to fix")
        try:
            codes = fix(codes, codes_file)
            logger.debug(f"Check: Fixed")
        except InvalidCodesException as ex:
            # not repairable: keep the codes unchanged and record the error
            errors.append(ex.error)

    return codes, errors
Identify the checks that do not pass and try to fix the failing codes with the process defined for each check.
def verify(self, codes: pd.DataFrame, codes_file: Path):
    """Evaluate every check's condition against the codes.

    Returns an array with one entry per check, in check order, holding
    whether all codes satisfy that check. ``codes_file`` is not used.
    """
    outcomes = [condition(codes).all() for _, condition, _ in self.checks]
    # appending to an empty array keeps the original result dtype and shape
    return np.append(np.array([]), outcomes)
verify codes in codes file
class Read2(Proto):
    """Parser for "Read2" codes.

    Loads the Read2 table, pads or truncates codes to 5 characters, and
    reports codes that are malformed or missing from the table.
    """

    def __init__(self):
        super().__init__("read2", trud.PROCESSED_PATH / "read2.parquet")

        def reject(message, passing=None):
            """Return a process callable that raises InvalidCodesException.

            ``passing`` is the check's condition; when given it is re-run on
            the codes to record the pass/fail mask on the error.
            """

            def fail(codes, codes_file):
                self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=None if passing is None else passing(codes),
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def has_rows(codes):
            return pd.Series([len(codes) > 0])

        def long_enough(codes):
            return ~(codes.str.len() < 5)

        def short_enough(codes):
            return ~(codes.str.len() > 5)

        def well_formed(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        def pad_with_dots(codes, codes_file):
            # right-pad short codes with "." up to 5 characters
            return codes.str.pad(width=5, side="right", fillchar=".")

        def truncate(codes, codes_file):
            # keep only the first 5 characters
            return codes.str[:5]

        self.checks = [
            ("Not Empty", has_rows, reject("Code list is empty")),
            ("Too Short", long_enough, pad_with_dots),
            ("Too Long", short_enough, truncate),
            (
                "Alphanumeric Dot",
                well_formed,
                reject("Illegal code format, not alphanumeric dot", well_formed),
            ),
            ("In Database", known, reject("Codes do not exist in database", known)),
        ]
This Read2 class extends Proto, adding custom validation checks for a dataset of "Read2" codes. It ensures that the dataset is loaded, validates the codes based on several rules, and applies corrections or logs errors when necessary.
Inherited Members
class Read3(Proto):
    """Parser for Read3 codes.

    Loads the Read3 table, pads or truncates codes to 5 characters, and
    reports codes that are malformed or missing from the table.
    """

    def __init__(self):
        # Lower-case name like every other parser: it is used as the database
        # column and as the reported code type.
        # NOTE(review): confirm the column in read3.parquet is named "read3".
        super().__init__("read3", trud.PROCESSED_PATH / "read3.parquet")

        def reject(message, passing=None):
            """Return a process callable that raises InvalidCodesException.

            ``passing`` is the check's condition; when given it is re-run on
            the codes to record the pass/fail mask on the error.
            """

            def fail(codes, codes_file):
                # The mask is passed as ``mask``: CodesError accepts no
                # ``check_regex`` keyword, so the previous call raised
                # TypeError instead of InvalidCodesException.
                self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=None if passing is None else passing(codes),
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def well_formed(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                reject("Code list is empty"),
            ),
            (
                # right-pad short codes with "." up to 5 characters
                "Too Short",
                lambda codes: ~(codes.str.len() < 5),
                lambda codes, codes_file: codes.str.pad(
                    width=5, side="right", fillchar="."
                ),
            ),
            (
                # keep only the first 5 characters
                "Too Long",
                lambda codes: ~(codes.str.len() > 5),
                lambda codes, codes_file: codes.str[:5],
            ),
            ("Alphanumeric Dot", well_formed, reject("QA Alphanumeric Dot", well_formed)),
            ("In Database", known, reject("QA In Database", known)),
        ]
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Icd10(Proto):
    """Parser for ICD10 codes.

    Loads the ICD10 table, strips dots from codes, and reports codes that are
    too short, not upper-case alphanumeric, or missing from the table.
    """

    def __init__(self):
        super().__init__("icd10", trud.PROCESSED_PATH / "icd10.parquet")

        def reject(message, passing):
            """Return a process callable that raises InvalidCodesException,
            recording ``passing(codes)`` as the pass/fail mask."""

            def fail(codes, codes_file):
                self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=passing(codes),
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def long_enough(codes):
            return ~(codes.str.len() < 3)

        def capital_alnum(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def known(codes):
            # known when present in either the primary or the "_alt" column
            return self.in_database(codes, self.db, self.name) | self.in_database(
                codes, self.db, self.name + "_alt"
            )

        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            f"Code list is empty {codes_file}",
                            codes=codes,
                            codes_file=codes_file,
                            mask=None,
                            code_type=self.name,
                        )
                    )
                ),
            ),
            ("Too Short", long_enough, reject("QA Too Short", long_enough)),
            (
                "Has Dot",
                lambda codes: ~(codes.str.match(r".*\..*")),  # passes when no dot
                # regex=False: "." must be removed literally, never treated
                # as the regex wildcard (the default differs across pandas
                # versions)
                lambda codes, codes_file: codes.str.replace(".", "", regex=False),
            ),
            (
                "Alphanumeric Capital",
                capital_alnum,
                reject("QA Alphanumeric Capital", capital_alnum),
            ),
            ("In Database", known, reject("QA In Database", known)),
        ]

    @staticmethod
    def trim_icd10(codes: pd.DataFrame) -> pd.DataFrame:
        """Return the codes truncated to their first 4 characters.

        Declared static because it takes no ``self``; it can now be called on
        the class or on an instance.
        """
        return codes.str[:4]
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Snomed(Proto):
    """Parser for SNOMED codes.

    Loads the SNOMED table and reports codes that are shorter than 6 or longer
    than 18 characters, not numeric, or missing from the table. None of the
    checks can be repaired, and there is no "Not Empty" check.
    """

    def __init__(self):
        super().__init__("snomed", trud.PROCESSED_PATH / "snomed.parquet")

        def reject(message, passing):
            """Return a process callable that raises InvalidCodesException,
            recording ``passing(codes)`` as the pass/fail mask."""

            def fail(codes, codes_file):
                self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=passing(codes),
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def long_enough(codes):
            return ~(codes.str.len() < 6)

        def short_enough(codes):
            return ~(codes.str.len() > 18)

        def numeric(codes):
            return codes.str.match(r"[0-9]+$")

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        self.checks = [
            ("Too Short", long_enough, reject("QA Too Short", long_enough)),
            ("Too Long", short_enough, reject("QA Too Long", short_enough)),
            ("Numeric", numeric, reject("QA Numeric", numeric)),
            ("In Database", known, reject("QA In Database", known)),
        ]
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Opcs4(Proto):
    """Parser for OPCS4 codes: the list must be non-empty and every code must
    appear in the OPCS4 table. Neither check can be repaired."""

    def __init__(self):
        super().__init__("opcs4", trud.PROCESSED_PATH / "opcs4.parquet")

        def reject(message, passing=None):
            """Return a process callable that raises InvalidCodesException;
            ``passing``, when given, supplies the pass/fail mask."""

            def fail(codes, codes_file):
                self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=None if passing is None else passing(codes),
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def has_rows(codes):
            return pd.Series([len(codes) > 0])

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        self.checks = [
            ("Not Empty", has_rows, reject("Code list is empty")),
            ("In Database", known, reject("QA In Database", known)),
        ]
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Atc(Proto):
    """Parser for ATC codes: the list must be non-empty and every code must be
    upper-case alphanumeric. No database table is loaded."""

    def __init__(self):
        super().__init__("atc", trud_codes_path=None)

        def reject(message, passing=None):
            """Return a process callable that raises InvalidCodesException;
            ``passing``, when given, supplies the pass/fail mask."""

            def fail(codes, codes_file):
                self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=None if passing is None else passing(codes),
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def has_rows(codes):
            return pd.Series([len(codes) > 0])

        def capital_alnum(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        self.checks = [
            ("Not Empty", has_rows, reject("Code list is empty")),
            (
                "Alphanumeric Capital",
                capital_alnum,
                reject("QA Alphanumeric Capital", capital_alnum),
            ),
        ]
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Med(Proto):
    """Parser for "med" codes: only checks that the code list is not empty.
    No database table is loaded."""

    def __init__(self):
        super().__init__("med", trud_codes_path=None)

        def has_rows(codes):
            return pd.Series([len(codes) > 0])

        def reject_empty(codes, codes_file):
            # an empty list cannot be repaired, so report it
            details = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            self.raise_exception(InvalidCodesException(details))

        self.checks = [("Not Empty", has_rows, reject_empty)]
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Cprd(Proto):
    """Parser for "cprd" codes: only checks that the code list is not empty.
    No database table is loaded."""

    def __init__(self):
        super().__init__("cprd", trud_codes_path=None)

        def has_rows(codes):
            return pd.Series([len(codes) > 0])

        def reject_empty(codes, codes_file):
            # an empty list cannot be repaired, so report it
            details = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            self.raise_exception(InvalidCodesException(details))

        self.checks = [("Not Empty", has_rows, reject_empty)]
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class CodeTypeParser:
    """Holds one parser instance per code type in ``code_types``, keyed by the
    code type name ("read2", "read3", "icd10", "snomed", "opcs4", "atc",
    "med", "cprd")."""

    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
        """Check the TRUD processed directory exists, then build all parsers.

        NOTE(review): the parsers built here take no path argument, so
        ``trud_processed_dir`` is only validated, not passed on.

        Raises:
            FileNotFoundError: if ``trud_processed_dir`` is not an existing directory.
        """
        # is_dir() is already False for a path that does not exist
        if not trud_processed_dir.is_dir():
            raise FileNotFoundError(
                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
            )

        # instantiated in this order, one parser per code type
        parser_classes = (
            ("read2", Read2),
            ("read3", Read3),
            ("icd10", Icd10),
            ("snomed", Snomed),
            ("opcs4", Opcs4),
            ("atc", Atc),
            ("med", Med),
            ("cprd", Cprd),
        )
        self.code_types = {
            code_type: parser_class() for code_type, parser_class in parser_classes
        }
A class that checks the TRUD processed directory exists and holds one parser instance per code type (read2, read3, icd10, snomed, opcs4, atc, med, cprd) in its code_types dictionary
def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
    """Check the TRUD processed directory exists, then build all parsers.

    The parsers are stored in ``self.code_types``, keyed by code type name.

    Raises:
        FileNotFoundError: if ``trud_processed_dir`` is not an existing directory.
    """
    # is_dir() is already False for a path that does not exist
    if not trud_processed_dir.is_dir():
        raise FileNotFoundError(
            f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
        )

    # instantiated in this order, one parser per code type
    parser_classes = (
        ("read2", Read2),
        ("read3", Read3),
        ("icd10", Icd10),
        ("snomed", Snomed),
        ("opcs4", Opcs4),
        ("atc", Atc),
        ("med", Med),
        ("cprd", Cprd),
    )
    self.code_types = {
        code_type: parser_class() for code_type, parser_class in parser_classes
    }