acmc.parse

  1import pandas as pd
  2import numpy as np
  3import os
  4from typing import Callable, Optional, Tuple
  5from pathlib import Path
  6
  7from acmc import trud, logging_config as lc
  8
# Set up the module-level logger through the project's logging_config helper (imported as lc)
logger = lc.setup_logger()
 11
 12# Define allowed values
 13SUPPORTED_CODE_TYPES = {"read2", "read3", "icd10", "snomed", "opcs4", "atc"}
 14
 15
 16class CodesError:
 17    """A class used in InvalidCodesException to report an error if a code parser check fails"""
 18
 19    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
 20        # initialise class variables with provided parameters
 21        for key, value in locals().items():
 22            if key != "self":
 23                setattr(self, key, value)
 24
 25
 26class InvalidCodesException(Exception):
 27    """Custom exception class raised when invalid codes are found that cannot be resolved by processing"""
 28
 29    def __init__(self, error):
 30        super().__init__(error.message)
 31        self.error = error
 32
 33
 34class Proto:
 35    """
 36    Define checks as list of 3 tuple: (Message, Condition, Process)
 37    - Message = The name of the condition (what is printed and logged)
 38    - Condition = True if Passed, and False if Failed
 39    - Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
 40    """
 41
 42    checks: list[
 43        tuple[
 44            str,  # The description, e.g., "Not Empty"
 45            Callable[
 46                [list],
 47                pd.Series,
 48            ],  # The first lambda function: takes a list and returns a pd.Series of booleans
 49            Callable[
 50                [list, Path],
 51                None,
 52            ],  # The second lambda function: takes a list and a string, and returns nothing
 53        ]
 54    ]
 55
 56    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
 57        if trud_codes_path is not None:
 58            if trud_codes_path.is_file():
 59                self.trud_codes_path: Path = trud_codes_path
 60                self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
 61            else:
 62                raise FileNotFoundError(
 63                    f"Error: Read2 code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
 64                )
 65
 66        self.name: str = name
 67
 68    def raise_exception(self, ex: Exception):
 69        """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explict"""
 70        raise ex
 71
 72    def in_database(
 73        self, codes: pd.DataFrame, db: pd.DataFrame, col: str
 74    ) -> pd.DataFrame:
 75        return codes.isin(db[col])
 76
 77    def process(
 78        self, codes: pd.DataFrame, codes_file: Path
 79    ) -> Tuple[pd.DataFrame, list]:
 80        """identify issues that do not pass and fix them with define/d process"""
 81        errors = []
 82        # Iter through each item in check.
 83        for msg, cond, fix in self.checks:
 84            # Check if any codes fail the check to False
 85            if not cond(codes).all():
 86                # Log the number of codes that failed
 87                logger.debug(
 88                    f"Check: {msg} {(~cond(codes)).sum()} failed, trying to fix"
 89                )
 90                # try fix errors by running lamba "process" function
 91                try:
 92                    codes = fix(codes, codes_file)
 93                    logger.debug(f"Check: Fixed")
 94                except InvalidCodesException as ex:
 95                    errors.append(ex.error)
 96            else:
 97                logger.debug(f"Check: passed")
 98
 99        return codes, errors
100
101    def verify(self, codes: pd.DataFrame, codes_file: Path):
102        """verify codes in codes file"""
103        conds = np.array([])
104
105        # Iter through each item in check.
106        for msg, cond, process in self.checks:
107            # run conditional check
108            out = cond(codes)
109            conds = np.append(conds, out.all())
110
111        return conds
112
113
class Read2(Proto):
    """Parser for "Read2" codes, built on Proto.

    Loads the Read2 reference table and registers the checks below. A failing
    check either repairs the codes (padding, truncation) or raises an
    InvalidCodesException describing the failure.
    """

    def __init__(self):
        super().__init__("read2", trud.PROCESSED_PATH / "read2.parquet")

        def reject(message, codes, codes_file, mask):
            # wrap the failure details in a CodesError and raise it
            self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        def has_codes(codes):
            # single-element Series: True when at least one code is present
            return pd.Series([len(codes) > 0])

        def not_too_short(codes):
            return ~(codes.str.len() < 5)

        def not_too_long(codes):
            return ~(codes.str.len() > 5)

        def alphanumeric_dot(codes):
            # only letters, digits and dots are accepted
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known(codes):
            # True where the code exists in the loaded Read2 table
            return self.in_database(codes, self.db, self.name)

        # validate checks
        self.checks = [
            (
                # an empty list cannot be repaired, so it is reported
                "Not Empty",
                has_codes,
                lambda codes, codes_file: reject(
                    "Code list is empty", codes, codes_file, None
                ),
            ),
            (
                # codes shorter than 5 characters are right-padded with dots
                "Too Short",
                not_too_short,
                lambda codes, codes_file: codes.str.pad(
                    width=5, side="right", fillchar="."
                ),
            ),
            (
                # codes longer than 5 characters are cut down to their first 5
                "Too Long",
                not_too_long,
                lambda codes, codes_file: codes.str[:5],
            ),
            (
                # any other character is reported as an invalid code format
                "Alphanumeric Dot",
                alphanumeric_dot,
                lambda codes, codes_file: reject(
                    "Illegal code format, not alphanumeric dot",
                    codes,
                    codes_file,
                    alphanumeric_dot(codes),
                ),
            ),
            (
                # codes missing from the Read2 table are reported
                "In Database",
                known,
                lambda codes, codes_file: reject(
                    "Codes do not exist in database", codes, codes_file, known(codes)
                ),
            ),
        ]
185
186
class Read3(Proto):
    """Parser for "read3" codes, built on Proto.

    Loads the Read3 reference table and registers the checks below. A failing
    check either repairs the codes (padding, truncation) or raises an
    InvalidCodesException describing the failure.
    """

    def __init__(self):
        # lowercase name, consistent with the sibling parsers, the "read3" registry key
        # and SUPPORTED_CODE_TYPES; the name is also used as the lookup column in self.db
        # NOTE(review): assumes the parquet column is "read3" like the other code types — confirm
        super().__init__("read3", trud.PROCESSED_PATH / "read3.parquet")

        def reject(message, codes, codes_file, mask):
            # CodesError takes the pass/fail mask as `mask`; the previous `check_regex`
            # keyword does not exist and raised TypeError instead of InvalidCodesException
            self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        def alphanumeric_dot(codes):
            # only letters, digits and dots are accepted
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known(codes):
            # True where the code exists in the loaded Read3 table
            return self.in_database(codes, self.db, self.name)

        self.checks = [
            (
                # an empty list cannot be repaired, so it is reported
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: reject(
                    "Code list is empty", codes, codes_file, None
                ),
            ),
            (
                # codes shorter than 5 characters are right-padded with dots
                "Too Short",
                lambda codes: ~(codes.str.len() < 5),
                lambda codes, codes_file: codes.str.pad(
                    width=5, side="right", fillchar="."
                ),
            ),
            (
                # codes longer than 5 characters are cut down to their first 5
                "Too Long",
                lambda codes: ~(codes.str.len() > 5),
                lambda codes, codes_file: codes.str[:5],
            ),
            (
                "Alphanumeric Dot",
                alphanumeric_dot,
                lambda codes, codes_file: reject(
                    "QA Alphanumeric Dot", codes, codes_file, alphanumeric_dot(codes)
                ),
            ),
            (
                "In Database",
                known,
                lambda codes, codes_file: reject(
                    "QA In Database", codes, codes_file, known(codes)
                ),
            ),
        ]
250
251
class Icd10(Proto):
    """Parser for "icd10" codes, built on Proto.

    Loads the ICD10 reference table and registers the checks below. Dots are
    removed from codes; every other failing check raises an
    InvalidCodesException describing the failure.
    """

    def __init__(self):
        super().__init__("icd10", trud.PROCESSED_PATH / "icd10.parquet")

        def reject(message, codes, codes_file, mask):
            # wrap the failure details in a CodesError and raise it
            self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        def long_enough(codes):
            return ~(codes.str.len() < 3)

        def alphanumeric_capital(codes):
            # only capital letters and digits are accepted
            return codes.str.match(r"^[A-Z0-9]+$")

        def known(codes):
            # a code is known when it appears in either the "icd10" or the "icd10_alt" column
            return self.in_database(codes, self.db, self.name) | self.in_database(
                codes, self.db, self.name + "_alt"
            )

        self.checks = [
            (
                # an empty list cannot be repaired, so it is reported
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: reject(
                    f"Code list is empty {codes_file}", codes, codes_file, None
                ),
            ),
            (
                "Too Short",
                long_enough,
                lambda codes, codes_file: reject(
                    "QA Too Short", codes, codes_file, long_enough(codes)
                ),
            ),
            (
                "Has Dot",
                lambda codes: ~(codes.str.match(r".*\..*")),  # check if contains dot
                # delete any dots in string; regex=False so "." is a literal dot
                # whatever the installed pandas uses as its default
                lambda codes, codes_file: codes.str.replace(".", "", regex=False),
            ),
            (
                "Alphanumeric Capital",
                alphanumeric_capital,
                lambda codes, codes_file: reject(
                    "QA Alphanumeric Capital",
                    codes,
                    codes_file,
                    alphanumeric_capital(codes),
                ),
            ),
            (
                "In Database",
                known,
                lambda codes, codes_file: reject(
                    "QA In Database", codes, codes_file, known(codes)
                ),
            ),
        ]

    @staticmethod
    def trim_icd10(codes: pd.Series) -> pd.Series:
        """Return the codes cut down to their first 4 characters.

        Declared static because it takes no ``self``; it can be called on the
        class or on an instance.
        """
        return codes.str[:4]
344
345
class Snomed(Proto):
    """Parser for "snomed" codes, built on Proto.

    Loads the SNOMED reference table and registers the checks below. No check
    repairs the codes: each failing check raises an InvalidCodesException
    describing the failure.
    """

    def __init__(self):
        super().__init__("snomed", trud.PROCESSED_PATH / "snomed.parquet")

        def reject(message, codes, codes_file, mask):
            # wrap the failure details in a CodesError and raise it
            self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        def report(message, passes):
            # fix step that reports `message` together with the pass mask of the codes
            return lambda codes, codes_file: reject(
                message, codes, codes_file, passes(codes)
            )

        def long_enough(codes):
            return ~(codes.str.len() < 6)

        def short_enough(codes):
            return ~(codes.str.len() > 18)

        def numeric(codes):
            return codes.str.match(r"[0-9]+$")

        def known(codes):
            # True where the code exists in the loaded SNOMED table
            return self.in_database(codes, self.db, self.name)

        # NOTE(review): no "Not Empty" check is registered here, unlike the sibling parsers — confirm this is intended
        self.checks = [
            ("Too Short", long_enough, report("QA Too Short", long_enough)),
            ("Too Long", short_enough, report("QA Too Long", short_enough)),
            ("Numeric", numeric, report("QA Numeric", numeric)),
            ("In Database", known, report("QA In Database", known)),
        ]
422
423
class Opcs4(Proto):
    """Parser for "opcs4" codes, built on Proto.

    Loads the OPCS4 reference table and checks that the code list is not empty
    and that every code exists in that table. Both failures raise an
    InvalidCodesException.
    """

    def __init__(self):
        super().__init__("opcs4", trud.PROCESSED_PATH / "opcs4.parquet")

        def reject(message, codes, codes_file, mask):
            # wrap the failure details in a CodesError and raise it
            self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        def has_codes(codes):
            # single-element Series: True when at least one code is present
            return pd.Series([len(codes) > 0])

        def known(codes):
            # True where the code exists in the loaded OPCS4 table
            return self.in_database(codes, self.db, self.name)

        self.checks = [
            (
                "Not Empty",
                has_codes,
                lambda codes, codes_file: reject(
                    "Code list is empty", codes, codes_file, None
                ),
            ),
            (
                "In Database",
                known,
                lambda codes, codes_file: reject(
                    "QA In Database", codes, codes_file, known(codes)
                ),
            ),
        ]
460
461
class Atc(Proto):
    """Parser for "atc" codes, built on Proto.

    No reference table is loaded. Checks that the code list is not empty and
    that codes contain only capital letters and digits. Both failures raise an
    InvalidCodesException.
    """

    def __init__(self):
        super().__init__("atc", trud_codes_path=None)

        def reject(message, codes, codes_file, mask):
            # wrap the failure details in a CodesError and raise it
            self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        def has_codes(codes):
            # single-element Series: True when at least one code is present
            return pd.Series([len(codes) > 0])

        def alphanumeric_capital(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        self.checks = [
            (
                "Not Empty",
                has_codes,
                lambda codes, codes_file: reject(
                    "Code list is empty", codes, codes_file, None
                ),
            ),
            (
                "Alphanumeric Capital",
                alphanumeric_capital,
                lambda codes, codes_file: reject(
                    "QA Alphanumeric Capital",
                    codes,
                    codes_file,
                    alphanumeric_capital(codes),
                ),
            ),
        ]
497
498
class Med(Proto):
    """Parser for "med" codes; loads no reference table and only checks that the code list is not empty."""

    def __init__(self):
        super().__init__("med", trud_codes_path=None)

        def has_codes(codes):
            # single-element Series: True when at least one code is present
            return pd.Series([len(codes) > 0])

        def reject_empty(codes, codes_file):
            # an empty list cannot be repaired, so it is reported
            self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        "Code list is empty",
                        codes=codes,
                        codes_file=codes_file,
                        mask=None,
                        code_type=self.name,
                    )
                )
            )

        self.checks = [("Not Empty", has_codes, reject_empty)]
519
520
class Cprd(Proto):
    """Parser for "cprd" codes; loads no reference table and only checks that the code list is not empty."""

    def __init__(self):
        super().__init__("cprd", trud_codes_path=None)

        def has_codes(codes):
            # single-element Series: True when at least one code is present
            return pd.Series([len(codes) > 0])

        def reject_empty(codes, codes_file):
            # an empty list cannot be repaired, so it is reported
            self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        "Code list is empty",
                        codes=codes,
                        codes_file=codes_file,
                        mask=None,
                        code_type=self.name,
                    )
                )
            )

        self.checks = [("Not Empty", has_codes, reject_empty)]
541
542
class CodeTypeParser:
    """Creates one parser instance per code type and keeps them in ``self.code_types``, keyed by code type name."""

    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
        # NOTE(review): the directory is only checked here; each parser builds its
        # own file path from trud.PROCESSED_PATH rather than from this argument
        if not (trud_processed_dir.exists() and trud_processed_dir.is_dir()):
            raise FileNotFoundError(
                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
            )

        # parsers are instantiated in this order
        parser_classes = (
            ("read2", Read2),
            ("read3", Read3),
            ("icd10", Icd10),
            ("snomed", Snomed),
            ("opcs4", Opcs4),
            ("atc", Atc),
            ("med", Med),
            ("cprd", Cprd),
        )
        self.code_types = {
            code_type: parser_cls() for code_type, parser_cls in parser_classes
        }
logger = <Logger acmc_logger (INFO)>
SUPPORTED_CODE_TYPES = {'icd10', 'atc', 'read2', 'snomed', 'opcs4', 'read3'}
class CodesError:
class CodesError:
    """Details of a failed code parser check, carried by InvalidCodesException."""

    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
        # keep every constructor argument on the instance under its own name
        self.message = message
        self.codes = codes
        self.codes_file = codes_file
        self.mask = mask
        self.code_type = code_type

A class used in InvalidCodesException to report an error if a code parser check fails

CodesError(message, codes=None, codes_file=None, mask=None, code_type=None)
20    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
21        # initialise class variables with provided parameters
22        for key, value in locals().items():
23            if key != "self":
24                setattr(self, key, value)
class InvalidCodesException(builtins.Exception):
class InvalidCodesException(Exception):
    """Exception raised when invalid codes are found that processing cannot resolve.

    The object passed in is kept on ``self.error``; its ``message`` attribute
    becomes the exception text.
    """

    def __init__(self, error):
        message = error.message
        super().__init__(message)
        self.error = error

Custom exception class raised when invalid codes are found that cannot be resolved by processing

InvalidCodesException(error)
30    def __init__(self, error):
31        super().__init__(error.message)
32        self.error = error
error
class Proto:
 35class Proto:
 36    """
 37    Define checks as list of 3 tuple: (Message, Condition, Process)
 38    - Message = The name of the condition (what is printed and logged)
 39    - Condition = True if Passed, and False if Failed
 40    - Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
 41    """
 42
 43    checks: list[
 44        tuple[
 45            str,  # The description, e.g., "Not Empty"
 46            Callable[
 47                [list],
 48                pd.Series,
 49            ],  # The first lambda function: takes a list and returns a pd.Series of booleans
 50            Callable[
 51                [list, Path],
 52                None,
 53            ],  # The second lambda function: takes a list and a string, and returns nothing
 54        ]
 55    ]
 56
 57    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
 58        if trud_codes_path is not None:
 59            if trud_codes_path.is_file():
 60                self.trud_codes_path: Path = trud_codes_path
 61                self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
 62            else:
 63                raise FileNotFoundError(
 64                    f"Error: Read2 code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
 65                )
 66
 67        self.name: str = name
 68
 69    def raise_exception(self, ex: Exception):
 70        """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explict"""
 71        raise ex
 72
 73    def in_database(
 74        self, codes: pd.DataFrame, db: pd.DataFrame, col: str
 75    ) -> pd.DataFrame:
 76        return codes.isin(db[col])
 77
 78    def process(
 79        self, codes: pd.DataFrame, codes_file: Path
 80    ) -> Tuple[pd.DataFrame, list]:
 81        """identify issues that do not pass and fix them with define/d process"""
 82        errors = []
 83        # Iter through each item in check.
 84        for msg, cond, fix in self.checks:
 85            # Check if any codes fail the check to False
 86            if not cond(codes).all():
 87                # Log the number of codes that failed
 88                logger.debug(
 89                    f"Check: {msg} {(~cond(codes)).sum()} failed, trying to fix"
 90                )
 91                # try fix errors by running lamba "process" function
 92                try:
 93                    codes = fix(codes, codes_file)
 94                    logger.debug(f"Check: Fixed")
 95                except InvalidCodesException as ex:
 96                    errors.append(ex.error)
 97            else:
 98                logger.debug(f"Check: passed")
 99
100        return codes, errors
101
102    def verify(self, codes: pd.DataFrame, codes_file: Path):
103        """verify codes in codes file"""
104        conds = np.array([])
105
106        # Iter through each item in check.
107        for msg, cond, process in self.checks:
108            # run conditional check
109            out = cond(codes)
110            conds = np.append(conds, out.all())
111
112        return conds

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Proto(name: str, trud_codes_path: Optional[pathlib.Path] = None)
57    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
58        if trud_codes_path is not None:
59            if trud_codes_path.is_file():
60                self.trud_codes_path: Path = trud_codes_path
61                self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
62            else:
63                raise FileNotFoundError(
64                    f"Error: Read2 code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
65                )
66
67        self.name: str = name
checks: list[tuple[str, typing.Callable[[list], pandas.core.series.Series], typing.Callable[[list, pathlib.Path], NoneType]]]
name: str
def raise_exception(self, ex: Exception):
69    def raise_exception(self, ex: Exception):
70        """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explict"""
71        raise ex

Raises an exception inside a lambda function. Python does not allow a raise statement inside a lambda because a lambda can only contain expressions, not statements. Named raise_exception rather than raise_ as it is more explicit.

def in_database( self, codes: pandas.core.frame.DataFrame, db: pandas.core.frame.DataFrame, col: str) -> pandas.core.frame.DataFrame:
73    def in_database(
74        self, codes: pd.DataFrame, db: pd.DataFrame, col: str
75    ) -> pd.DataFrame:
76        return codes.isin(db[col])
def process( self, codes: pandas.core.frame.DataFrame, codes_file: pathlib.Path) -> Tuple[pandas.core.frame.DataFrame, list]:
 78    def process(
 79        self, codes: pd.DataFrame, codes_file: Path
 80    ) -> Tuple[pd.DataFrame, list]:
 81        """identify issues that do not pass and fix them with define/d process"""
 82        errors = []
 83        # Iter through each item in check.
 84        for msg, cond, fix in self.checks:
 85            # Check if any codes fail the check to False
 86            if not cond(codes).all():
 87                # Log the number of codes that failed
 88                logger.debug(
 89                    f"Check: {msg} {(~cond(codes)).sum()} failed, trying to fix"
 90                )
 91                # try fix errors by running lamba "process" function
 92                try:
 93                    codes = fix(codes, codes_file)
 94                    logger.debug(f"Check: Fixed")
 95                except InvalidCodesException as ex:
 96                    errors.append(ex.error)
 97            else:
 98                logger.debug(f"Check: passed")
 99
100        return codes, errors

Identify checks that do not pass and fix them with the defined process

def verify(self, codes: pandas.core.frame.DataFrame, codes_file: pathlib.Path):
102    def verify(self, codes: pd.DataFrame, codes_file: Path):
103        """verify codes in codes file"""
104        conds = np.array([])
105
106        # Iter through each item in check.
107        for msg, cond, process in self.checks:
108            # run conditional check
109            out = cond(codes)
110            conds = np.append(conds, out.all())
111
112        return conds

verify codes in codes file

class Read2(Proto):
class Read2(Proto):
    """Parser for Read2 codes: a Proto subclass registering the Read2 checks.

    The checks are listed in this order: reject an empty code list, right-pad
    codes shorter than 5 characters with dots, truncate codes longer than 5
    characters, reject codes that are not alphanumeric/dot, and reject codes
    that are not found in the "read2" column of self.db.
    """

    def __init__(self):
        super().__init__("read2", trud.PROCESSED_PATH / "read2.parquet")

        def reject(message, codes, codes_file, mask):
            # Unfixable failure: wrap the details in a CodesError and pass the
            # exception to raise_exception.
            return self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        def well_formed(codes):
            # Only letters, digits and dots are allowed
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known(codes):
            # Code exists in the reference table column named after this parser
            return self.in_database(codes, self.db, self.name)

        # (Message, Condition, Process) tuples
        self.checks = [
            (
                # An empty code list cannot be fixed, so it is reported
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: reject(
                    "Code list is empty", codes, codes_file, None
                ),
            ),
            (
                # Codes shorter than 5 characters are padded on the right with dots
                "Too Short",
                lambda codes: ~(codes.str.len() < 5),
                lambda codes, codes_file: codes.str.pad(
                    width=5, side="right", fillchar="."
                ),
            ),
            (
                # Codes longer than 5 characters are cut down to the first 5
                "Too Long",
                lambda codes: ~(codes.str.len() > 5),
                lambda codes, codes_file: codes.str[:5],
            ),
            (
                # Codes with characters other than letters, digits or dots are reported
                "Alphanumeric Dot",
                well_formed,
                lambda codes, codes_file: reject(
                    "Illegal code format, not alphanumeric dot",
                    codes,
                    codes_file,
                    well_formed(codes),
                ),
            ),
            (
                # Codes missing from the reference table are reported
                "In Database",
                known,
                lambda codes, codes_file: reject(
                    "Codes do not exist in database",
                    codes,
                    codes_file,
                    known(codes),
                ),
            ),
        ]

This Read2 class extends Proto, adding custom validation checks for a dataset of "Read2" codes. It ensures that the dataset is loaded, validates the codes based on several rules, and applies corrections or logs errors when necessary.

checks
class Read3(Proto):
class Read3(Proto):
    """Parser for Read3 codes: a Proto subclass registering the Read3 checks.

    The checks are listed in this order: reject an empty code list, right-pad
    codes shorter than 5 characters with dots, truncate codes longer than 5
    characters, reject codes that are not alphanumeric/dot, and reject codes
    that are not found in self.db.
    """

    def __init__(self):
        # NOTE(review): the name is "Read3" while the other parsers use
        # lowercase names; self.name is also the column looked up in self.db by
        # the "In Database" check - confirm the column is really "Read3".
        super().__init__("Read3", trud.PROCESSED_PATH / "read3.parquet")

        def reject(message, codes, codes_file, mask):
            # CodesError only accepts the failing-row mask under the keyword
            # `mask`; any other keyword raises TypeError instead of the
            # InvalidCodesException that process() knows how to collect.
            return self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        def well_formed(codes):
            # Only letters, digits and dots are allowed
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known(codes):
            # Code exists in the reference table column named after this parser
            return self.in_database(codes, self.db, self.name)

        # (Message, Condition, Process) tuples
        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: reject(
                    "Code list is empty", codes, codes_file, None
                ),
            ),
            (
                # Codes shorter than 5 characters are padded on the right with dots
                "Too Short",
                lambda codes: ~(codes.str.len() < 5),
                lambda codes, codes_file: codes.str.pad(
                    width=5, side="right", fillchar="."
                ),
            ),
            (
                # Codes longer than 5 characters are cut down to the first 5
                "Too Long",
                lambda codes: ~(codes.str.len() > 5),
                lambda codes, codes_file: codes.str[:5],
            ),
            (
                "Alphanumeric Dot",
                well_formed,
                lambda codes, codes_file: reject(
                    "QA Alphanumeric Dot", codes, codes_file, well_formed(codes)
                ),
            ),
            (
                "In Database",
                known,
                lambda codes, codes_file: reject(
                    "QA In Database", codes, codes_file, known(codes)
                ),
            ),
        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Icd10(Proto):
class Icd10(Proto):
    """Parser for ICD10 codes: a Proto subclass registering the ICD10 checks.

    The checks are listed in this order: reject an empty code list, reject
    codes shorter than 3 characters, strip dots from codes, reject codes that
    are not upper-case alphanumeric, and reject codes found in neither the
    "icd10" nor the "icd10_alt" column of self.db.
    """

    def __init__(self):
        super().__init__("icd10", trud.PROCESSED_PATH / "icd10.parquet")

        def reject(message, codes, codes_file, mask):
            # Unfixable failure: wrap the details in a CodesError and pass the
            # exception to raise_exception.
            return self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        def long_enough(codes):
            return ~(codes.str.len() < 3)

        def upper_alnum(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def known(codes):
            # A code is accepted when it appears in either the primary column
            # or the "_alt" column of the reference table.
            return self.in_database(
                codes, self.db, self.name
            ) | self.in_database(codes, self.db, self.name + "_alt")

        # (Message, Condition, Process) tuples
        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: reject(
                    f"Code list is empty {codes_file}", codes, codes_file, None
                ),
            ),
            (
                "Too Short",
                long_enough,
                lambda codes, codes_file: reject(
                    "QA Too Short", codes, codes_file, long_enough(codes)
                ),
            ),
            (
                "Has Dot",
                lambda codes: ~(codes.str.match(r".*\..*")),  # check if contains dot
                # Delete any dots in the string. regex=False is required: as a
                # regular expression "." matches every character, which would
                # erase the whole code on pandas versions defaulting to regex.
                lambda codes, codes_file: codes.str.replace(".", "", regex=False),
                # Disabled alternative: only keep the part before the dot
                # lambda codes : codes.str.split('\.').apply(lambda ls: ls[0])
            ),
            (
                "Alphanumeric Capital",
                upper_alnum,
                lambda codes, codes_file: reject(
                    "QA Alphanumeric Capital", codes, codes_file, upper_alnum(codes)
                ),
            ),
            (
                "In Database",
                known,
                lambda codes, codes_file: reject(
                    "QA In Database", codes, codes_file, known(codes)
                ),
            ),
            # Disabled check, kept for reference:
            # (
            #     "ICD10 Regex",
            #     lambda codes : codes.str.match("[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Alpha, Num, Num , Dot?, 4xAlphNum*
            #     lambda codes : lc.log_invalid_code(codes,
            #         codes.str.match("[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Log non-matching rows
            #         code_type="icd10",
            #
            # )
        ]

    @staticmethod
    def trim_icd10(codes: pd.Series) -> pd.Series:
        """Return codes truncated to their first four characters.

        Declared static because the function takes no ``self``; as a plain
        method, calling it on an instance would bind the instance to ``codes``.
        """
        return codes.str[:4]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
def trim_icd10(codes: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame:
342    def trim_icd10(codes: pd.DataFrame) -> pd.DataFrame:
343        codes = codes.str[:4]
344        return codes
class Snomed(Proto):
class Snomed(Proto):
    """Parser for SNOMED codes: a Proto subclass registering the SNOMED checks.

    The checks are listed in this order: reject codes shorter than 6
    characters, reject codes longer than 18 characters, reject codes that are
    not numeric, and reject codes not found in the "snomed" column of self.db.
    None of these checks attempts a fix; every failure is reported.
    """

    def __init__(self):
        super().__init__("snomed", trud.PROCESSED_PATH / "snomed.parquet")

        def reject(message, codes, codes_file, mask):
            # Unfixable failure: wrap the details in a CodesError and pass the
            # exception to raise_exception.
            return self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        def long_enough(codes):
            return ~(codes.str.len() < 6)

        def short_enough(codes):
            return ~(codes.str.len() > 18)

        def numeric(codes):
            return codes.str.match(r"[0-9]+$")

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        # (Message, Condition, Process) tuples
        self.checks = [
            # Disabled check, kept for reference:
            # (
            # 	"Not Empty",
            # 	lambda codes : pd.Series([len(codes) > 0]),
            # 	lambda codes : raise_exception(Exception("Code List is Empty"))
            # ),
            (
                "Too Short",
                long_enough,
                lambda codes, codes_file: reject(
                    "QA Too Short", codes, codes_file, long_enough(codes)
                ),
            ),
            (
                "Too Long",
                short_enough,
                lambda codes, codes_file: reject(
                    "QA Too Long", codes, codes_file, short_enough(codes)
                ),
            ),
            (
                "Numeric",
                numeric,
                lambda codes, codes_file: reject(
                    "QA Numeric", codes, codes_file, numeric(codes)
                ),
            ),
            # Disabled check, kept for reference:
            # (
            # 	"Is Integer",
            # 	lambda codes : codes.dtype == int,
            # 	lambda codes : codes.astype(int) #Convert to integer
            # ),
            (
                "In Database",
                known,
                lambda codes, codes_file: reject(
                    "QA In Database", codes, codes_file, known(codes)
                ),
            ),
        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Opcs4(Proto):
class Opcs4(Proto):
    """Parser for OPCS4 codes: a Proto subclass registering the OPCS4 checks.

    Two checks are registered: reject an empty code list, and reject codes
    not found in the "opcs4" column of self.db. Neither attempts a fix.
    """

    def __init__(self):
        super().__init__("opcs4", trud.PROCESSED_PATH / "opcs4.parquet")

        def reject(message, codes, codes_file, mask):
            # Unfixable failure: wrap the details in a CodesError and pass the
            # exception to raise_exception.
            return self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        # (Message, Condition, Process) tuples
        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: reject(
                    "Code list is empty", codes, codes_file, None
                ),
            ),
            (
                "In Database",
                known,
                lambda codes, codes_file: reject(
                    "QA In Database", codes, codes_file, known(codes)
                ),
            ),
        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Atc(Proto):
class Atc(Proto):
    """Parser for ATC codes: a Proto subclass registering the ATC checks.

    No reference table path is given to Proto. Two checks are registered:
    reject an empty code list, and reject codes that are not upper-case
    alphanumeric. Neither attempts a fix.
    """

    def __init__(self):
        super().__init__("atc", trud_codes_path=None)

        def reject(message, codes, codes_file, mask):
            # Unfixable failure: wrap the details in a CodesError and pass the
            # exception to raise_exception.
            return self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        def upper_alnum(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        # (Message, Condition, Process) tuples
        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: reject(
                    "Code list is empty", codes, codes_file, None
                ),
            ),
            (
                "Alphanumeric Capital",
                upper_alnum,
                lambda codes, codes_file: reject(
                    "QA Alphanumeric Capital", codes, codes_file, upper_alnum(codes)
                ),
            ),
        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Med(Proto):
class Med(Proto):
    """Parser for "med" codes: a Proto subclass with a single not-empty check.

    No reference table path is given to Proto; an empty code list is reported
    and no fix is attempted.
    """

    def __init__(self):
        super().__init__("med", trud_codes_path=None)

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def report_empty(codes, codes_file):
            # An empty list cannot be repaired, so report it
            details = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(details))

        # (Message, Condition, Process) tuples
        self.checks = [("Not Empty", has_codes, report_empty)]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Cprd(Proto):
class Cprd(Proto):
    """Parser for "cprd" codes: a Proto subclass with a single not-empty check.

    No reference table path is given to Proto; an empty code list is reported
    and no fix is attempted.
    """

    def __init__(self):
        super().__init__("cprd", trud_codes_path=None)

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def report_empty(codes, codes_file):
            # An empty list cannot be repaired, so report it
            details = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(details))

        # (Message, Condition, Process) tuples
        self.checks = [("Not Empty", has_codes, report_empty)]
541        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class CodeTypeParser:
class CodeTypeParser:
    """Registry holding one parser instance per code type, keyed by code type name.

    Raises FileNotFoundError on construction when the TRUD processed
    directory does not exist or is not a directory.
    """

    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
        directory_ready = trud_processed_dir.exists() and trud_processed_dir.is_dir()
        if not directory_ready:
            raise FileNotFoundError(
                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
            )

        # Parser classes in registration order; each is instantiated once
        parser_classes = {
            "read2": Read2,
            "read3": Read3,
            "icd10": Icd10,
            "snomed": Snomed,
            "opcs4": Opcs4,
            "atc": Atc,
            "med": Med,
            "cprd": Cprd,
        }
        self.code_types = {
            code_type: parser_cls() for code_type, parser_cls in parser_classes.items()
        }

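Builds one parser instance for each code type (read2, read3, icd10, snomed, opcs4, atc, med, cprd) and stores them in code_types, keyed by code type name. Raises FileNotFoundError if the TRUD processed directory does not exist.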

CodeTypeParser(trud_processed_dir: pathlib.Path = PosixPath('vocab/trud/processed'))
547    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
548
549        if not trud_processed_dir.exists() or not trud_processed_dir.is_dir():
550            raise FileNotFoundError(
551                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
552            )
553
554        self.code_types = {
555            "read2": Read2(),
556            "read3": Read3(),
557            "icd10": Icd10(),
558            "snomed": Snomed(),
559            "opcs4": Opcs4(),
560            "atc": Atc(),
561            "med": Med(),
562            "cprd": Cprd(),
563        }
code_types