acmc.parse
1import pandas as pd 2import numpy as np 3import os 4from typing import Callable, Optional, Tuple 5from pathlib import Path 6 7from acmc import trud, logging_config as lc 8 9# setup logging 10logger = lc.setup_logger() 11 12# Define allowed values 13SUPPORTED_CODE_TYPES = {"read2", "read3", "icd10", "snomed", "opcs4", "atc"} 14 15 16class CodesError: 17 """A class used in InvalidCodesException to report an error if a code parser check fails""" 18 19 def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None): 20 # initialise class variables with provided parameters 21 for key, value in locals().items(): 22 if key != "self": 23 setattr(self, key, value) 24 25 26class InvalidCodesException(Exception): 27 """Custom exception class raised when invalid codes are found that cannot be resolved by processing""" 28 29 def __init__(self, error): 30 super().__init__(error.message) 31 self.error = error 32 33 34class Proto: 35 """ 36 Define checks as list of 3 tuple: (Message, Condition, Process) 37 - Message = The name of the condition (what is printed and logged) 38 - Condition = True if Passed, and False if Failed 39 - Process = Aims to resolve all issues that stop condition from passing (Do not change index!) 40 """ 41 42 checks: list[ 43 tuple[ 44 str, # The description, e.g., "Not Empty" 45 Callable[ 46 [pd.DataFrame], 47 pd.Series, 48 ], # The first lambda function: takes a list and returns a pd.Series of booleans 49 Callable[ 50 [pd.DataFrame, Path], 51 pd.DataFrame, 52 ], # The second lambda function: takes a list and a string, and returns nothing 53 ] 54 ] 55 56 def __init__(self, name: str, trud_codes_path: Optional[Path] = None): 57 if trud_codes_path is not None: 58 if trud_codes_path.is_file(): 59 self.trud_codes_path: Path = trud_codes_path 60 self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path) 61 else: 62 raise FileNotFoundError( 63 f"Error: Read2 code file '{trud_codes_path}' does not exist. 
Please ensure you have installed TRUD correctly" 64 ) 65 66 self.name: str = name 67 68 def raise_exception(self, ex: Exception): 69 """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explict""" 70 raise ex 71 72 def in_database( 73 self, codes: pd.DataFrame, db: pd.DataFrame, col: str 74 ) -> pd.DataFrame: 75 return codes.isin(db[col]) 76 77 def process( 78 self, codes: pd.DataFrame, codes_file: Path 79 ) -> Tuple[pd.DataFrame, list]: 80 """identify issues that do not pass and fix them with define/d process""" 81 errors = [] 82 # Iter through each item in check. 83 for msg, cond, fix in self.checks: 84 # Check if any codes fail the check to False 85 if not cond(codes).all(): 86 # Log the number of codes that failed 87 logger.debug( 88 f"Check: {msg} {(~cond(codes)).sum()} failed, trying to fix" 89 ) 90 # try fix errors by running lamba "process" function 91 try: 92 codes = fix(codes, codes_file) 93 logger.debug(f"Check: Fixed") 94 except InvalidCodesException as ex: 95 errors.append(ex.error) 96 else: 97 logger.debug(f"Check: passed") 98 99 return codes, errors 100 101 def verify(self, codes: pd.DataFrame, codes_file: Path): 102 """verify codes in codes file""" 103 conds = np.array([]) 104 105 # Iter through each item in check. 106 for msg, cond, process in self.checks: 107 # run conditional check 108 out = cond(codes) 109 conds = np.append(conds, out.all()) 110 111 return conds 112 113 114class Read2(Proto): 115 """This Read2 class extends Proto, adding custom validation checks for a dataset of "Read2" codes. 
It ensures that the dataset is loaded, validates the codes based on several rules, and applies corrections or logs errors when necessary.""" 116 117 def __init__(self): 118 super().__init__("read2", trud.PROCESSED_PATH / "read2.parquet") 119 120 # validate checks 121 self.checks = [ 122 ( 123 # check codes are not empty, if empty throw an exception 124 "Not Empty", 125 lambda codes: pd.Series([len(codes) > 0]), 126 lambda codes, codes_file: self.raise_exception( 127 InvalidCodesException( 128 CodesError( 129 f"Code list is empty", 130 codes=codes, 131 codes_file=codes_file, 132 mask=None, 133 code_type=self.name, 134 ) 135 ) 136 ), 137 ), 138 ( 139 # check codes <5 characters, if too short pads it with . (dots) to reach 5 characters 140 "Too Short", 141 lambda codes: ~(codes.str.len() < 5), 142 lambda codes, codes_file: codes.str.pad( 143 width=5, side="right", fillchar="." 144 ), 145 ), 146 ( 147 # check codes > 5 characters, If too long, truncates them to 5 characters 148 "Too Long", 149 lambda codes: ~(codes.str.len() > 5), 150 lambda codes, codes_file: codes.str[:5], 151 ), 152 ( 153 # checks codes contain numbers, or dots (.), if not logs invalid code error 154 "Alphanumeric Dot", 155 lambda codes: codes.str.match(r"^[a-zA-Z0-9.]+$"), 156 lambda codes, codes_file: self.raise_exception( 157 InvalidCodesException( 158 CodesError( 159 f"Illegal code format, not alphanumeric dot", 160 codes=codes, 161 codes_file=codes_file, 162 mask=codes.str.match(r"^[a-zA-Z0-9.]+$"), 163 code_type=self.name, 164 ) 165 ) 166 ), 167 ), 168 ( 169 # checks code exists in self.db (the Read2 dataset). If missing log invalid codes. 
170 "In Database", 171 lambda codes: self.in_database(codes, self.db, self.name), 172 lambda codes, codes_file: self.raise_exception( 173 InvalidCodesException( 174 CodesError( 175 f"Codes do not exist in database", 176 codes=codes, 177 codes_file=codes_file, 178 mask=self.in_database(codes, self.db, self.name), 179 code_type=self.name, 180 ) 181 ) 182 ), 183 ), 184 ] 185 186 187class Read3(Proto): 188 def __init__(self): 189 super().__init__("Read3", trud.PROCESSED_PATH / "read3.parquet") 190 191 self.checks = [ 192 ( 193 "Not Empty", 194 lambda codes: pd.Series([len(codes) > 0]), 195 lambda codes, codes_file: self.raise_exception( 196 InvalidCodesException( 197 CodesError( 198 f"Code list is empty", 199 codes=codes, 200 codes_file=codes_file, 201 mask=None, 202 code_type=self.name, 203 ) 204 ) 205 ), 206 ), 207 ( 208 "Too Short", 209 lambda codes: ~(codes.str.len() < 5), 210 lambda codes, codes_file: codes.str.pad( 211 width=5, side="right", fillchar="." 212 ), 213 ), 214 ( 215 "Too Long", 216 lambda codes: ~(codes.str.len() > 5), 217 lambda codes, codes_file: codes.str[:5], 218 ), 219 ( 220 "Alphanumeric Dot", 221 lambda codes: codes.str.match(r"^[a-zA-Z0-9.]+$"), 222 lambda codes, codes_file: self.raise_exception( 223 InvalidCodesException( 224 CodesError( 225 f"QA Alphanumeric Dot", 226 codes=codes, 227 codes_file=codes_file, 228 check_regex=codes.str.match(r"^[a-zA-Z0-9.]+$"), 229 code_type=self.name, 230 ) 231 ) 232 ), 233 ), 234 ( 235 "In Database", 236 lambda codes: self.in_database(codes, self.db, self.name), 237 lambda codes, codes_file: self.raise_exception( 238 InvalidCodesException( 239 CodesError( 240 f"QA In Database", 241 codes=codes, 242 codes_file=codes_file, 243 check_regex=self.in_database(codes, self.db, self.name), 244 code_type=self.name, 245 ) 246 ) 247 ), 248 ), 249 ] 250 251 252class Icd10(Proto): 253 def __init__(self): 254 super().__init__("icd10", trud.PROCESSED_PATH / "icd10.parquet") 255 256 self.checks = [ 257 ( 258 "Not Empty", 259 
lambda codes: pd.Series([len(codes) > 0]), 260 lambda codes, codes_file: self.raise_exception( 261 InvalidCodesException( 262 CodesError( 263 f"Code list is empty {codes_file}", 264 codes=codes, 265 codes_file=codes_file, 266 mask=None, 267 code_type=self.name, 268 ) 269 ) 270 ), 271 ), 272 ( 273 "Too Short", 274 lambda codes: ~(codes.str.len() < 3), 275 lambda codes, codes_file: self.raise_exception( 276 InvalidCodesException( 277 CodesError( 278 f"QA Too Short", 279 codes=codes, 280 codes_file=codes_file, 281 mask=~(codes.str.len() < 3), 282 code_type=self.name, 283 ) 284 ) 285 ), 286 ), 287 ( 288 "Has Dot", 289 lambda codes: ~(codes.str.match(r".*\..*")), # check if contains dot 290 lambda codes, codes_file: codes.str.replace( 291 ".", "" 292 ), # delete any dots in string 293 # lambda codes : codes.str.split('\.').apply(lambda ls: ls[0]) #only get part before dot 294 ), 295 ( 296 "Alphanumeric Capital", 297 lambda codes: codes.str.match(r"^[A-Z0-9]+$"), 298 lambda codes, codes_file: self.raise_exception( 299 InvalidCodesException( 300 CodesError( 301 f"QA Alphanumeric Capital", 302 codes=codes, 303 codes_file=codes_file, 304 mask=codes.str.match(r"^[A-Z0-9]+$"), 305 code_type=self.name, 306 ) 307 ) 308 ), 309 ), 310 ( 311 "In Database", 312 lambda codes: ~( 313 ~self.in_database(codes, self.db, self.name) 314 & ~self.in_database(codes, self.db, self.name + "_alt") 315 ), 316 lambda codes, codes_file: self.raise_exception( 317 InvalidCodesException( 318 CodesError( 319 f"QA In Database", 320 codes=codes, 321 codes_file=codes_file, 322 mask=~( 323 ~self.in_database(codes, self.db, self.name) 324 & ~self.in_database(codes, self.db, self.name + "_alt") 325 ), 326 code_type=self.name, 327 ) 328 ) 329 ), 330 ), 331 # ( 332 # "ICD10 Regex", 333 # lambda codes : codes.str.match("[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Alpha, Num, Num , Dot?, 4xAlphNum* 334 # lambda codes : lc.log_invalid_code(codes, 335 # codes.str.match("[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Log 
non-matching rows 336 # code_type="icd10", 337 # 338 # ) 339 ] 340 341 342class Snomed(Proto): 343 def __init__(self): 344 super().__init__("snomed", trud.PROCESSED_PATH / "snomed.parquet") 345 346 self.checks = [ 347 # ( 348 # "Not Empty", 349 # lambda codes : pd.Series([len(codes) > 0]), 350 # lambda codes : raise_exception(Exception("Code List is Empty")) 351 # ), 352 ( 353 "Too Short", 354 lambda codes: ~(codes.str.len() < 6), 355 lambda codes, codes_file: self.raise_exception( 356 InvalidCodesException( 357 CodesError( 358 f"QA Too Short", 359 codes=codes, 360 codes_file=codes_file, 361 mask=~(codes.str.len() < 6), 362 code_type=self.name, 363 ) 364 ) 365 ), 366 ), 367 ( 368 "Too Long", 369 lambda codes: ~(codes.str.len() > 18), 370 lambda codes, codes_file: self.raise_exception( 371 InvalidCodesException( 372 CodesError( 373 f"QA Too Long", 374 codes=codes, 375 codes_file=codes_file, 376 mask=~(codes.str.len() > 18), 377 code_type=self.name, 378 ) 379 ) 380 ), 381 ), 382 ( 383 "Numeric", 384 lambda codes: codes.str.match(r"[0-9]+$"), 385 lambda codes, codes_file: self.raise_exception( 386 InvalidCodesException( 387 CodesError( 388 f"QA Numeric", 389 codes=codes, 390 codes_file=codes_file, 391 mask=codes.str.match(r"[0-9]+$"), 392 code_type=self.name, 393 ) 394 ) 395 ), 396 ), 397 # ( 398 # "Is Integer", 399 # lambda codes : codes.dtype == int, 400 # lambda codes : codes.astype(int) #Convert to integer 401 # ), 402 ( 403 "In Database", 404 lambda codes: self.in_database(codes, self.db, self.name), 405 lambda codes, codes_file: self.raise_exception( 406 InvalidCodesException( 407 CodesError( 408 f"QA In Database", 409 codes=codes, 410 codes_file=codes_file, 411 mask=self.in_database(codes, self.db, self.name), 412 code_type=self.name, 413 ) 414 ) 415 ), 416 ), 417 ] 418 419 420class Opcs4(Proto): 421 def __init__(self): 422 super().__init__("opcs4", trud.PROCESSED_PATH / "opcs4.parquet") 423 424 self.checks = [ 425 ( 426 "Not Empty", 427 lambda codes: 
pd.Series([len(codes) > 0]), 428 lambda codes, codes_file: self.raise_exception( 429 InvalidCodesException( 430 CodesError( 431 f"Code list is empty", 432 codes=codes, 433 codes_file=codes_file, 434 mask=None, 435 code_type=self.name, 436 ) 437 ) 438 ), 439 ), 440 ( 441 "In Database", 442 lambda codes: self.in_database(codes, self.db, self.name), 443 lambda codes, codes_file: self.raise_exception( 444 InvalidCodesException( 445 CodesError( 446 f"QA In Database", 447 codes=codes, 448 codes_file=codes_file, 449 mask=self.in_database(codes, self.db, self.name), 450 code_type=self.name, 451 ) 452 ) 453 ), 454 ), 455 ] 456 457 458class Atc(Proto): 459 def __init__(self): 460 super().__init__("atc", trud_codes_path=None) 461 self.checks = [ 462 ( 463 "Not Empty", 464 lambda codes: pd.Series([len(codes) > 0]), 465 lambda codes, codes_file: self.raise_exception( 466 InvalidCodesException( 467 CodesError( 468 f"Code list is empty", 469 codes=codes, 470 codes_file=codes_file, 471 mask=None, 472 code_type=self.name, 473 ) 474 ) 475 ), 476 ), 477 ( 478 "Alphanumeric Capital", 479 lambda codes: codes.str.match(r"^[A-Z0-9]+$"), 480 lambda codes, codes_file: self.raise_exception( 481 InvalidCodesException( 482 CodesError( 483 f"QA Alphanumeric Capital", 484 codes=codes, 485 codes_file=codes_file, 486 mask=codes.str.match(r"^[A-Z0-9]+$"), 487 code_type=self.name, 488 ) 489 ) 490 ), 491 ), 492 ] 493 494 495class Med(Proto): 496 def __init__(self): 497 super().__init__("med", trud_codes_path=None) 498 self.checks = [ 499 ( 500 "Not Empty", 501 lambda codes: pd.Series([len(codes) > 0]), 502 lambda codes, codes_file: self.raise_exception( 503 InvalidCodesException( 504 CodesError( 505 f"Code list is empty", 506 codes=codes, 507 codes_file=codes_file, 508 mask=None, 509 code_type=self.name, 510 ) 511 ) 512 ), 513 ) 514 ] 515 516 517class Cprd(Proto): 518 def __init__(self): 519 super().__init__("cprd", trud_codes_path=None) 520 self.checks = [ 521 ( 522 "Not Empty", 523 lambda codes: 
pd.Series([len(codes) > 0]), 524 lambda codes, codes_file: self.raise_exception( 525 InvalidCodesException( 526 CodesError( 527 f"Code list is empty", 528 codes=codes, 529 codes_file=codes_file, 530 mask=None, 531 code_type=self.name, 532 ) 533 ) 534 ), 535 ) 536 ] 537 538 539class CodeTypeParser: 540 """A class used in InvalidCodesException to report an error if a code parser check fails""" 541 542 def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH): 543 if not trud_processed_dir.exists() or not trud_processed_dir.is_dir(): 544 raise FileNotFoundError( 545 f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install" 546 ) 547 548 self.code_types = { 549 "read2": Read2(), 550 "read3": Read3(), 551 "icd10": Icd10(), 552 "snomed": Snomed(), 553 "opcs4": Opcs4(), 554 "atc": Atc(), 555 "med": Med(), 556 "cprd": Cprd(), 557 }
class CodesError:
    """Record describing a failed code parser check, carried by InvalidCodesException.

    Each constructor argument is stored on the instance under the same name:
    ``message``, ``codes``, ``codes_file``, ``mask`` and ``code_type``.
    """

    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
        # Assign explicitly rather than looping over locals(): the attribute
        # set is then visible to readers and tools, and does not depend on
        # how the interpreter exposes the frame's local variables.
        self.message = message
        self.codes = codes
        self.codes_file = codes_file
        self.mask = mask
        self.code_type = code_type
A class used in InvalidCodesException to report an error if a code parser check fails
class InvalidCodesException(Exception):
    """Exception raised when invalid codes are found that processing cannot resolve.

    The error object passed in is kept on ``self.error``; its ``message``
    attribute becomes the exception text.
    """

    def __init__(self, error):
        self.error = error
        super().__init__(error.message)
Custom exception class raised when invalid codes are found that cannot be resolved by processing
class Proto:
    """Base class for code-type parsers driven by a list of checks.

    Define checks as a list of 3-tuples: (Message, Condition, Process)
    - Message = The name of the condition (what is printed and logged)
    - Condition = True if Passed, and False if Failed
    - Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
    """

    checks: list[
        tuple[
            str,  # The description, e.g., "Not Empty"
            Callable[
                [pd.Series],
                pd.Series,
            ],  # Condition: takes the codes and returns a boolean pd.Series
            Callable[
                [pd.Series, Path],
                pd.Series,
            ],  # Process: takes the codes and the codes file, returns fixed codes or raises
        ]
    ]

    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
        """Store the code type name and, when a path is given, load its code table.

        Args:
            name: code type name; subclasses also use it as the column to look
                up in ``self.db``.
            trud_codes_path: parquet file to load into ``self.db``. When None,
                neither ``self.db`` nor ``self.trud_codes_path`` is set.

        Raises:
            FileNotFoundError: if ``trud_codes_path`` is given but is not a file.
        """
        self.name: str = name

        if trud_codes_path is not None:
            if not trud_codes_path.is_file():
                # Report the code type actually being loaded; the message used
                # to say "Read2" for every code type.
                raise FileNotFoundError(
                    f"Error: {name} code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
                )
            self.trud_codes_path: Path = trud_codes_path
            self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)

    def raise_exception(self, ex: Exception):
        """Raise ``ex``; lets a lambda (expression-only) trigger an exception.

        Python does not allow the raise statement inside a lambda because a
        lambda can only contain expressions, not statements. Named
        raise_exception rather than raise_ as it is more explicit.
        """
        raise ex

    def in_database(self, codes: pd.Series, db: pd.DataFrame, col: str) -> pd.Series:
        """Return a boolean mask marking which codes appear in column ``col`` of ``db``."""
        return codes.isin(db[col])

    def process(self, codes: pd.Series, codes_file: Path) -> Tuple[pd.Series, list]:
        """Run every check in order, applying its fix step when the check fails.

        Args:
            codes: the codes to check.
            codes_file: passed through to each fix step.

        Returns:
            The codes after all successful fixes, and the list of error objects
            taken from every InvalidCodesException raised by a fix step.
        """
        errors = []
        for msg, cond, fix in self.checks:
            # Evaluate the condition once; it was previously run a second time
            # just to count failures.
            passed = cond(codes)
            if passed.all():
                logger.debug("Check: passed")
                continue

            logger.debug(f"Check: {msg} {(~passed).sum()} failed, trying to fix")
            try:
                codes = fix(codes, codes_file)
                logger.debug("Check: Fixed")
            except InvalidCodesException as ex:
                # Unfixable: keep the codes unchanged and record the error.
                errors.append(ex.error)

        return codes, errors

    def verify(self, codes: pd.Series, codes_file: Path):
        """Evaluate every check condition without fixing anything.

        Returns:
            A numpy array with one entry per check, holding whether all codes
            passed that check. ``codes_file`` is not used.
        """
        conds = np.array([])
        for _msg, cond, _fix in self.checks:
            conds = np.append(conds, cond(codes).all())
        return conds
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
    """Store the code type name and, when a path is given, load its code table.

    Args:
        name: code type name, stored on ``self.name``.
        trud_codes_path: parquet file to load into ``self.db``. When None,
            neither ``self.db`` nor ``self.trud_codes_path`` is set.

    Raises:
        FileNotFoundError: if ``trud_codes_path`` is given but is not a file.
    """
    self.name: str = name

    if trud_codes_path is not None:
        if not trud_codes_path.is_file():
            # Name the code type actually being loaded; the message used to
            # say "Read2" whichever parser was being constructed.
            raise FileNotFoundError(
                f"Error: {name} code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
            )
        self.trud_codes_path: Path = trud_codes_path
        self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
def raise_exception(self, ex: Exception):
    """Raise ``ex``; lets a lambda (expression-only) trigger an exception.

    Python does not allow the raise statement inside a lambda because a lambda
    can only contain expressions, not statements. Named raise_exception rather
    than raise_ as it is more explicit.
    """
    raise ex
Raises an exception inside a lambda function. Python does not allow the raise statement inside a lambda because a lambda can only contain expressions, not statements. Named raise_exception rather than raise_ as it is more explicit.
def process(self, codes: pd.Series, codes_file: Path) -> Tuple[pd.Series, list]:
    """Run every check in order, applying its fix step when the check fails.

    Args:
        codes: the codes to check.
        codes_file: passed through to each fix step.

    Returns:
        The codes after all successful fixes, and the list of error objects
        taken from every InvalidCodesException raised by a fix step.
    """
    errors = []
    for msg, cond, fix in self.checks:
        # Evaluate the condition once; it was previously run a second time
        # just to count failures.
        passed = cond(codes)
        if passed.all():
            logger.debug("Check: passed")
            continue

        logger.debug(f"Check: {msg} {(~passed).sum()} failed, trying to fix")
        try:
            codes = fix(codes, codes_file)
            logger.debug("Check: Fixed")
        except InvalidCodesException as ex:
            # Unfixable: keep the codes unchanged and record the error.
            errors.append(ex.error)

    return codes, errors
Identify checks that do not pass and try to fix them with each check's defined process
def verify(self, codes: pd.DataFrame, codes_file: Path):
    """Evaluate every check condition against the codes without fixing anything.

    Returns a numpy array with one entry per check, holding whether all codes
    passed that check. ``codes_file`` is not used.
    """
    # Gather each check's overall outcome, then append them in one step to an
    # empty array, as the one-at-a-time append did.
    outcomes = [cond(codes).all() for _msg, cond, _fix in self.checks]
    return np.append(np.array([]), outcomes)
verify codes in codes file
class Read2(Proto):
    """Parser for "Read2" codes, extending Proto with Read2-specific checks.

    Loads the Read2 dataset from the TRUD processed path, then defines checks
    that pad or truncate codes to 5 characters and raise
    InvalidCodesException for an empty list, malformed codes, or codes missing
    from the dataset.
    """

    def __init__(self):
        super().__init__("read2", trud.PROCESSED_PATH / "read2.parquet")

        def reject(message, mask_of=None):
            """Build a fix step that reports the failure instead of repairing it."""

            def fix(codes, codes_file):
                mask = None if mask_of is None else mask_of(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return fix

        def has_rows(codes):
            # A single-element Series so the result supports .all() like the other checks.
            return pd.Series([len(codes) > 0])

        def not_short(codes):
            return ~(codes.str.len() < 5)

        def pad_with_dots(codes, codes_file):
            # Right-pad with "." up to 5 characters.
            return codes.str.pad(width=5, side="right", fillchar=".")

        def not_long(codes):
            return ~(codes.str.len() > 5)

        def truncate(codes, codes_file):
            # Keep only the first 5 characters.
            return codes.str[:5]

        def well_formed(codes):
            # Letters, digits and dots only.
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known(codes):
            # Present in the loaded Read2 dataset (self.db).
            return self.in_database(codes, self.db, self.name)

        # Order matters: each fix feeds the codes seen by later checks.
        self.checks = [
            ("Not Empty", has_rows, reject("Code list is empty")),
            ("Too Short", not_short, pad_with_dots),
            ("Too Long", not_long, truncate),
            (
                "Alphanumeric Dot",
                well_formed,
                reject("Illegal code format, not alphanumeric dot", well_formed),
            ),
            (
                "In Database",
                known,
                reject("Codes do not exist in database", known),
            ),
        ]
This Read2 class extends Proto, adding custom validation checks for a dataset of "Read2" codes. It ensures that the dataset is loaded, validates the codes based on several rules, and applies corrections or logs errors when necessary.
Inherited Members
class Read3(Proto):
    """Parser for "Read3" codes, extending Proto with Read3-specific checks.

    Loads the Read3 dataset from the TRUD processed path, then defines checks
    that pad or truncate codes to 5 characters and raise
    InvalidCodesException for an empty list, malformed codes, or codes missing
    from the dataset.
    """

    def __init__(self):
        # Lower-case name, matching every other code type and the "read3"
        # registry key; the name is also the column looked up in self.db.
        # NOTE(review): confirm the read3.parquet column is named "read3".
        super().__init__("read3", trud.PROCESSED_PATH / "read3.parquet")

        def reject(message, mask_of=None):
            """Build a fix step that reports the failure instead of repairing it."""

            def fix(codes, codes_file):
                # CodesError takes the mask as ``mask``; the former
                # ``check_regex=`` keyword raised TypeError, so the failure
                # was never reported as InvalidCodesException.
                mask = None if mask_of is None else mask_of(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return fix

        def has_rows(codes):
            # A single-element Series so the result supports .all() like the other checks.
            return pd.Series([len(codes) > 0])

        def not_short(codes):
            return ~(codes.str.len() < 5)

        def pad_with_dots(codes, codes_file):
            # Right-pad with "." up to 5 characters.
            return codes.str.pad(width=5, side="right", fillchar=".")

        def not_long(codes):
            return ~(codes.str.len() > 5)

        def truncate(codes, codes_file):
            # Keep only the first 5 characters.
            return codes.str[:5]

        def well_formed(codes):
            # Letters, digits and dots only.
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known(codes):
            # Present in the loaded Read3 dataset (self.db).
            return self.in_database(codes, self.db, self.name)

        # Order matters: each fix feeds the codes seen by later checks.
        self.checks = [
            ("Not Empty", has_rows, reject("Code list is empty")),
            ("Too Short", not_short, pad_with_dots),
            ("Too Long", not_long, truncate),
            ("Alphanumeric Dot", well_formed, reject("QA Alphanumeric Dot", well_formed)),
            ("In Database", known, reject("QA In Database", known)),
        ]
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Icd10(Proto):
    """Parser for "icd10" codes, extending Proto with ICD-10-specific checks.

    Loads the ICD-10 dataset from the TRUD processed path, then defines checks
    that strip dots from codes and raise InvalidCodesException for an empty
    list, codes shorter than 3 characters, codes that are not upper-case
    alphanumeric, or codes found in neither the "icd10" nor the "icd10_alt"
    column of the dataset.
    """

    def __init__(self):
        super().__init__("icd10", trud.PROCESSED_PATH / "icd10.parquet")

        def reject(message, mask_of=None):
            """Build a fix step that reports the failure instead of repairing it."""

            def fix(codes, codes_file):
                mask = None if mask_of is None else mask_of(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return fix

        def has_rows(codes):
            # A single-element Series so the result supports .all() like the other checks.
            return pd.Series([len(codes) > 0])

        def report_empty(codes, codes_file):
            # This message, unlike the other parsers', includes the file path.
            return self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        f"Code list is empty {codes_file}",
                        codes=codes,
                        codes_file=codes_file,
                        mask=None,
                        code_type=self.name,
                    )
                )
            )

        def not_short(codes):
            return ~(codes.str.len() < 3)

        def dot_free(codes):
            return ~(codes.str.match(r".*\..*"))  # check if contains dot

        def strip_dots(codes, codes_file):
            # regex=False makes "." a literal dot whatever the pandas default
            # is; as a regular expression "." would delete every character.
            return codes.str.replace(".", "", regex=False)

        def upper_alnum(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def known(codes):
            # Valid when present in either the primary or the alternative column.
            in_primary = self.in_database(codes, self.db, self.name)
            in_alt = self.in_database(codes, self.db, self.name + "_alt")
            return in_primary | in_alt

        # Order matters: dots are stripped before the format and database checks.
        self.checks = [
            ("Not Empty", has_rows, report_empty),
            ("Too Short", not_short, reject("QA Too Short", not_short)),
            ("Has Dot", dot_free, strip_dots),
            (
                "Alphanumeric Capital",
                upper_alnum,
                reject("QA Alphanumeric Capital", upper_alnum),
            ),
            ("In Database", known, reject("QA In Database", known)),
        ]
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Snomed(Proto):
    """Parser for "snomed" codes, extending Proto with SNOMED-specific checks.

    Loads the SNOMED dataset from the TRUD processed path. No check repairs
    codes: each one raises InvalidCodesException for codes shorter than 6 or
    longer than 18 characters, non-numeric codes, or codes missing from the
    dataset.
    """

    def __init__(self):
        super().__init__("snomed", trud.PROCESSED_PATH / "snomed.parquet")

        def reject(message, mask_of):
            """Build a fix step that reports the failure instead of repairing it."""

            def fix(codes, codes_file):
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask_of(codes),
                            code_type=self.name,
                        )
                    )
                )

            return fix

        def not_short(codes):
            return ~(codes.str.len() < 6)

        def not_long(codes):
            return ~(codes.str.len() > 18)

        def numeric(codes):
            return codes.str.match(r"[0-9]+$")

        def known(codes):
            # Present in the loaded SNOMED dataset (self.db).
            return self.in_database(codes, self.db, self.name)

        # The "Not Empty" and "Is Integer" checks are commented out in the
        # original source and stay disabled here.
        self.checks = [
            ("Too Short", not_short, reject("QA Too Short", not_short)),
            ("Too Long", not_long, reject("QA Too Long", not_long)),
            ("Numeric", numeric, reject("QA Numeric", numeric)),
            ("In Database", known, reject("QA In Database", known)),
        ]
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Opcs4(Proto):
    """Parser for "opcs4" codes, extending Proto with OPCS-4-specific checks.

    Loads the OPCS-4 dataset from the TRUD processed path and raises
    InvalidCodesException for an empty list or codes missing from the dataset.
    """

    def __init__(self):
        super().__init__("opcs4", trud.PROCESSED_PATH / "opcs4.parquet")

        def reject(message, mask_of=None):
            """Build a fix step that reports the failure instead of repairing it."""

            def fix(codes, codes_file):
                mask = None if mask_of is None else mask_of(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return fix

        def has_rows(codes):
            # A single-element Series so the result supports .all() like the other checks.
            return pd.Series([len(codes) > 0])

        def known(codes):
            # Present in the loaded OPCS-4 dataset (self.db).
            return self.in_database(codes, self.db, self.name)

        self.checks = [
            ("Not Empty", has_rows, reject("Code list is empty")),
            ("In Database", known, reject("QA In Database", known)),
        ]
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Atc(Proto):
    """Parser for "atc" codes, extending Proto with format-only checks.

    No dataset is loaded (``trud_codes_path`` is None). Raises
    InvalidCodesException for an empty list or codes that are not upper-case
    alphanumeric.
    """

    def __init__(self):
        super().__init__("atc", trud_codes_path=None)

        def reject(message, mask_of=None):
            """Build a fix step that reports the failure instead of repairing it."""

            def fix(codes, codes_file):
                mask = None if mask_of is None else mask_of(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return fix

        def has_rows(codes):
            # A single-element Series so the result supports .all() like the other checks.
            return pd.Series([len(codes) > 0])

        def upper_alnum(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        self.checks = [
            ("Not Empty", has_rows, reject("Code list is empty")),
            (
                "Alphanumeric Capital",
                upper_alnum,
                reject("QA Alphanumeric Capital", upper_alnum),
            ),
        ]
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Med(Proto):
    """Parser for "med" codes; its only check is that the code list is not empty.

    No dataset is loaded (``trud_codes_path`` is None).
    """

    def __init__(self):
        super().__init__("med", trud_codes_path=None)

        def has_rows(codes):
            # A single-element Series so the result supports .all().
            return pd.Series([len(codes) > 0])

        def report_empty(codes, codes_file):
            # An empty list cannot be repaired, so report it.
            error = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(error))

        self.checks = [("Not Empty", has_rows, report_empty)]
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Cprd(Proto):
    """Parser for "cprd" codes; its only check is that the code list is not empty.

    No dataset is loaded (``trud_codes_path`` is None).
    """

    def __init__(self):
        super().__init__("cprd", trud_codes_path=None)

        def has_rows(codes):
            # A single-element Series so the result supports .all().
            return pd.Series([len(codes) > 0])

        def report_empty(codes, codes_file):
            # An empty list cannot be repaired, so report it.
            error = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(error))

        self.checks = [("Not Empty", has_rows, report_empty)]
Define checks as list of 3 tuple: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class CodeTypeParser:
    """Registry holding one parser instance per code type.

    After construction ``self.code_types`` maps each code type name
    ("read2", "read3", "icd10", "snomed", "opcs4", "atc", "med", "cprd") to
    its parser object.
    """

    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
        """Check the TRUD processed directory exists, then build every parser.

        Args:
            trud_processed_dir: directory expected to hold the processed TRUD
                files. It is only checked here; each parser resolves its own
                file from ``trud.PROCESSED_PATH``.

        Raises:
            FileNotFoundError: if ``trud_processed_dir`` is not an existing
                directory, or (from the parser constructors) if a parser's
                parquet file is missing.
        """
        if not (trud_processed_dir.exists() and trud_processed_dir.is_dir()):
            raise FileNotFoundError(
                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
            )

        self.code_types = {
            "read2": Read2(),
            "read3": Read3(),
            "icd10": Icd10(),
            "snomed": Snomed(),
            "opcs4": Opcs4(),
            "atc": Atc(),
            "med": Med(),
            "cprd": Cprd(),
        }
A registry that builds one parser per code type (read2, read3, icd10, snomed, opcs4, atc, med, cprd) and stores them in code_types, keyed by code type name
def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
    """Check the TRUD processed directory exists, then build every parser.

    Raises FileNotFoundError when ``trud_processed_dir`` is not an existing
    directory. On success ``self.code_types`` maps each code type name to a
    freshly constructed parser.
    """
    directory_ready = trud_processed_dir.exists() and trud_processed_dir.is_dir()
    if not directory_ready:
        raise FileNotFoundError(
            f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
        )

    # Instantiated in this order, one parser per code type key.
    parser_factories = (
        ("read2", Read2),
        ("read3", Read3),
        ("icd10", Icd10),
        ("snomed", Snomed),
        ("opcs4", Opcs4),
        ("atc", Atc),
        ("med", Med),
        ("cprd", Cprd),
    )
    self.code_types = {key: factory() for key, factory in parser_factories}