acmc.parse

  1import pandas as pd
  2import numpy as np
  3import os
  4from typing import Callable, Optional, Tuple
  5from pathlib import Path
  6
  7from acmc import trud, logging_config as lc
  8
  9# setup logging
 10logger = lc.setup_logger()
 11
 12# Define allowed values
 13SUPPORTED_CODE_TYPES = {"read2", "read3", "icd10", "snomed", "opcs4", "atc"}
 14
 15
 16class CodesError:
 17    """A class used in InvalidCodesException to report an error if a code parser check fails"""
 18
 19    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
 20        # initialise class variables with provided parameters
 21        for key, value in locals().items():
 22            if key != "self":
 23                setattr(self, key, value)
 24
 25
 26class InvalidCodesException(Exception):
 27    """Custom exception class raised when invalid codes are found that cannot be resolved by processing"""
 28
 29    def __init__(self, error):
 30        super().__init__(error.message)
 31        self.error = error
 32
 33
 34class Proto:
 35    """
 36    Define checks as list of 3 tuple: (Message, Condition, Process)
 37    - Message = The name of the condition (what is printed and logged)
 38    - Condition = True if Passed, and False if Failed
 39    - Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
 40    """
 41
 42    checks: list[
 43        tuple[
 44            str,  # The description, e.g., "Not Empty"
 45            Callable[
 46                [pd.DataFrame],
 47                pd.Series,
 48            ],  # The first lambda function: takes a list and returns a pd.Series of booleans
 49            Callable[
 50                [pd.DataFrame, Path],
 51                pd.DataFrame,
 52            ],  # The second lambda function: takes a list and a string, and returns nothing
 53        ]
 54    ]
 55
 56    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
 57        if trud_codes_path is not None:
 58            if trud_codes_path.is_file():
 59                self.trud_codes_path: Path = trud_codes_path
 60                self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
 61            else:
 62                raise FileNotFoundError(
 63                    f"Error: Read2 code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
 64                )
 65
 66        self.name: str = name
 67
 68    def raise_exception(self, ex: Exception):
 69        """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explict"""
 70        raise ex
 71
 72    def in_database(
 73        self, codes: pd.DataFrame, db: pd.DataFrame, col: str
 74    ) -> pd.DataFrame:
 75        return codes.isin(db[col])
 76
 77    def process(
 78        self, codes: pd.DataFrame, codes_file: Path
 79    ) -> Tuple[pd.DataFrame, list]:
 80        """identify issues that do not pass and fix them with define/d process"""
 81        errors = []
 82        # Iter through each item in check.
 83        for msg, cond, fix in self.checks:
 84            # Check if any codes fail the check to False
 85            if not cond(codes).all():
 86                # Log the number of codes that failed
 87                logger.debug(
 88                    f"Check: {msg} {(~cond(codes)).sum()} failed, trying to fix"
 89                )
 90                # try fix errors by running lamba "process" function
 91                try:
 92                    codes = fix(codes, codes_file)
 93                    logger.debug(f"Check: Fixed")
 94                except InvalidCodesException as ex:
 95                    errors.append(ex.error)
 96            else:
 97                logger.debug(f"Check: passed")
 98
 99        return codes, errors
100
101    def verify(self, codes: pd.DataFrame, codes_file: Path):
102        """verify codes in codes file"""
103        conds = np.array([])
104
105        # Iter through each item in check.
106        for msg, cond, process in self.checks:
107            # run conditional check
108            out = cond(codes)
109            conds = np.append(conds, out.all())
110
111        return conds
112
113
class Read2(Proto):
    """Parser for Read2 codes.

    Loads the processed Read2 table and registers, in order, these checks:
    not empty, too short (right-padded with "." to 5 characters), too long
    (cut to the first 5 characters), alphanumeric-or-dot format, and presence
    in the loaded table. The first and the last two raise instead of fixing.
    """

    def __init__(self):
        super().__init__("read2", trud.PROCESSED_PATH / "read2.parquet")

        def reject(message, passing=None):
            """Build a fix step that always raises InvalidCodesException.

            When ``passing`` (the check's own condition) is given, its result
            for the codes is attached to the error as ``mask``.
            """

            def fail(codes, codes_file):
                mask = None if passing is None else passing(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return fail

        # --- conditions: True where a code passes ---
        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def not_too_short(codes):
            return ~(codes.str.len() < 5)

        def not_too_long(codes):
            return ~(codes.str.len() > 5)

        def alphanumeric_dot(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def in_db(codes):
            return self.in_database(codes, self.db, self.name)

        # --- fixes that repair rather than raise ---
        def pad_with_dots(codes, codes_file):
            return codes.str.pad(width=5, side="right", fillchar=".")

        def truncate(codes, codes_file):
            return codes.str[:5]

        self.checks = [
            ("Not Empty", not_empty, reject("Code list is empty")),
            ("Too Short", not_too_short, pad_with_dots),
            ("Too Long", not_too_long, truncate),
            (
                "Alphanumeric Dot",
                alphanumeric_dot,
                reject("Illegal code format, not alphanumeric dot", alphanumeric_dot),
            ),
            ("In Database", in_db, reject("Codes do not exist in database", in_db)),
        ]
185
186
class Read3(Proto):
    """Parser for Read3 codes.

    Loads the processed Read3 table and registers, in order, these checks:
    not empty, too short (right-padded with "." to 5 characters), too long
    (cut to the first 5 characters), alphanumeric-or-dot format, and presence
    in the loaded table. The first and the last two raise instead of fixing.
    """

    def __init__(self):
        # NOTE(review): the name is capitalised ("Read3") while the other
        # parsers use lower-case names; it is also used below as the lookup
        # column of self.db -- confirm the table really has a "Read3" column.
        super().__init__("Read3", trud.PROCESSED_PATH / "read3.parquet")

        def reject(message, passing=None):
            """Build a fix step that always raises InvalidCodesException.

            When ``passing`` (the check's own condition) is given, its result
            for the codes is attached to the error as ``mask``.
            """

            def fail(codes, codes_file):
                # CodesError only accepts ``mask``; the former ``check_regex``
                # keyword raised TypeError instead of reporting the bad codes.
                mask = None if passing is None else passing(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return fail

        # --- conditions: True where a code passes ---
        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def not_too_short(codes):
            return ~(codes.str.len() < 5)

        def not_too_long(codes):
            return ~(codes.str.len() > 5)

        def alphanumeric_dot(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def in_db(codes):
            return self.in_database(codes, self.db, self.name)

        # --- fixes that repair rather than raise ---
        def pad_with_dots(codes, codes_file):
            return codes.str.pad(width=5, side="right", fillchar=".")

        def truncate(codes, codes_file):
            return codes.str[:5]

        self.checks = [
            ("Not Empty", not_empty, reject("Code list is empty")),
            ("Too Short", not_too_short, pad_with_dots),
            ("Too Long", not_too_long, truncate),
            ("Alphanumeric Dot", alphanumeric_dot, reject("QA Alphanumeric Dot", alphanumeric_dot)),
            ("In Database", in_db, reject("QA In Database", in_db)),
        ]
250
251
class Icd10(Proto):
    """Parser for ICD-10 codes.

    Loads the processed ICD-10 table and registers, in order, these checks:
    not empty, at least 3 characters, no dots (dots are deleted), upper-case
    alphanumeric format, and presence in either the "icd10" or the "icd10_alt"
    column of the loaded table. Only the dot check repairs; the rest raise.
    """

    def __init__(self):
        super().__init__("icd10", trud.PROCESSED_PATH / "icd10.parquet")

        def reject(message, passing=None):
            """Build a fix step that always raises InvalidCodesException.

            When ``passing`` (the check's own condition) is given, its result
            for the codes is attached to the error as ``mask``.
            """

            def fail(codes, codes_file):
                mask = None if passing is None else passing(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def reject_empty(codes, codes_file):
            # The message embeds the file name, so it is built per call.
            return reject(f"Code list is empty {codes_file}")(codes, codes_file)

        # --- conditions: True where a code passes ---
        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def not_too_short(codes):
            return ~(codes.str.len() < 3)

        def has_no_dot(codes):
            return ~(codes.str.match(r".*\..*"))

        def alphanumeric_capital(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def in_db(codes):
            # A code passes when it is in either column.
            return self.in_database(codes, self.db, self.name) | self.in_database(
                codes, self.db, self.name + "_alt"
            )

        # --- fix that repairs rather than raises ---
        def delete_dots(codes, codes_file):
            # regex=False: "." must be a literal dot. As a regex it matches
            # every character and would empty each code; the default for
            # ``regex`` differs between pandas versions.
            return codes.str.replace(".", "", regex=False)

        self.checks = [
            ("Not Empty", not_empty, reject_empty),
            ("Too Short", not_too_short, reject("QA Too Short", not_too_short)),
            ("Has Dot", has_no_dot, delete_dots),
            (
                "Alphanumeric Capital",
                alphanumeric_capital,
                reject("QA Alphanumeric Capital", alphanumeric_capital),
            ),
            ("In Database", in_db, reject("QA In Database", in_db)),
        ]
340
341
class Snomed(Proto):
    """Parser for SNOMED codes.

    Loads the processed SNOMED table and registers, in order, these checks:
    at least 6 characters, at most 18 characters, digits only, and presence in
    the loaded table. Every fix step raises; no "Not Empty" check is registered.
    """

    def __init__(self):
        super().__init__("snomed", trud.PROCESSED_PATH / "snomed.parquet")

        def reject(message, passing):
            """Build a fix step that raises InvalidCodesException.

            The result of ``passing`` (the check's own condition) for the codes
            is attached to the error as ``mask``.
            """

            def fail(codes, codes_file):
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=passing(codes),
                            code_type=self.name,
                        )
                    )
                )

            return fail

        # --- conditions: True where a code passes ---
        def not_too_short(codes):
            return ~(codes.str.len() < 6)

        def not_too_long(codes):
            return ~(codes.str.len() > 18)

        def numeric(codes):
            return codes.str.match(r"[0-9]+$")

        def in_db(codes):
            return self.in_database(codes, self.db, self.name)

        self.checks = [
            ("Too Short", not_too_short, reject("QA Too Short", not_too_short)),
            ("Too Long", not_too_long, reject("QA Too Long", not_too_long)),
            ("Numeric", numeric, reject("QA Numeric", numeric)),
            ("In Database", in_db, reject("QA In Database", in_db)),
        ]
418
419
class Opcs4(Proto):
    """Parser for OPCS-4 codes.

    Loads the processed OPCS-4 table and registers two checks, both of which
    raise on failure: not empty, and presence in the loaded table.
    """

    def __init__(self):
        super().__init__("opcs4", trud.PROCESSED_PATH / "opcs4.parquet")

        def reject(message, passing=None):
            """Build a fix step that always raises InvalidCodesException.

            When ``passing`` (the check's own condition) is given, its result
            for the codes is attached to the error as ``mask``.
            """

            def fail(codes, codes_file):
                mask = None if passing is None else passing(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def in_db(codes):
            return self.in_database(codes, self.db, self.name)

        self.checks = [
            ("Not Empty", not_empty, reject("Code list is empty")),
            ("In Database", in_db, reject("QA In Database", in_db)),
        ]
456
457
class Atc(Proto):
    """Parser for ATC codes.

    No code table is loaded. Two checks are registered, both of which raise on
    failure: not empty, and upper-case alphanumeric format.
    """

    def __init__(self):
        super().__init__("atc", trud_codes_path=None)

        def reject(message, passing=None):
            """Build a fix step that always raises InvalidCodesException.

            When ``passing`` (the check's own condition) is given, its result
            for the codes is attached to the error as ``mask``.
            """

            def fail(codes, codes_file):
                mask = None if passing is None else passing(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def alphanumeric_capital(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        self.checks = [
            ("Not Empty", not_empty, reject("Code list is empty")),
            (
                "Alphanumeric Capital",
                alphanumeric_capital,
                reject("QA Alphanumeric Capital", alphanumeric_capital),
            ),
        ]
493
494
class Med(Proto):
    """Parser for "med" codes.

    No code table is loaded; the only registered check is "Not Empty", which
    raises InvalidCodesException when the code list has no entries.
    """

    def __init__(self):
        super().__init__("med", trud_codes_path=None)

        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def reject_empty(codes, codes_file):
            error = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(error))

        self.checks = [("Not Empty", not_empty, reject_empty)]
515
516
class Cprd(Proto):
    """Parser for "cprd" codes.

    No code table is loaded; the only registered check is "Not Empty", which
    raises InvalidCodesException when the code list has no entries.
    """

    def __init__(self):
        super().__init__("cprd", trud_codes_path=None)

        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def reject_empty(codes, codes_file):
            error = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(error))

        self.checks = [("Not Empty", not_empty, reject_empty)]
537
538
class CodeTypeParser:
    """Registry holding one constructed parser per code type.

    ``code_types`` maps a code-type name ("read2", "read3", "icd10", "snomed",
    "opcs4", "atc", "med", "cprd") to an instance of the matching parser class.
    """

    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
        """Check the TRUD processed directory, then build every parser.

        Args:
            trud_processed_dir: directory expected to hold the processed TRUD
                files. NOTE(review): it is only validated here; the parsers
                are constructed without arguments and build their own paths
                from ``trud.PROCESSED_PATH`` -- confirm that a custom
                directory is meant to be honoured.

        Raises:
            FileNotFoundError: ``trud_processed_dir`` is missing or is not a
                directory.
        """
        # is_dir() is False for a path that does not exist, so one test
        # covers both the "missing" and the "not a directory" case.
        if not trud_processed_dir.is_dir():
            raise FileNotFoundError(
                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
            )

        self.code_types = {
            "read2": Read2(),
            "read3": Read3(),
            "icd10": Icd10(),
            "snomed": Snomed(),
            "opcs4": Opcs4(),
            "atc": Atc(),
            "med": Med(),
            "cprd": Cprd(),
        }
logger = <Logger acmc_logger (INFO)>
SUPPORTED_CODE_TYPES = {'read2', 'icd10', 'atc', 'snomed', 'read3', 'opcs4'}
class CodesError:
17class CodesError:
18    """A class used in InvalidCodesException to report an error if a code parser check fails"""
19
20    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
21        # initialise class variables with provided parameters
22        for key, value in locals().items():
23            if key != "self":
24                setattr(self, key, value)

A class used in InvalidCodesException to report an error if a code parser check fails

CodesError(message, codes=None, codes_file=None, mask=None, code_type=None)
20    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
21        # initialise class variables with provided parameters
22        for key, value in locals().items():
23            if key != "self":
24                setattr(self, key, value)
class InvalidCodesException(builtins.Exception):
27class InvalidCodesException(Exception):
28    """Custom exception class raised when invalid codes are found that cannot be resolved by processing"""
29
30    def __init__(self, error):
31        super().__init__(error.message)
32        self.error = error

Custom exception class raised when invalid codes are found that cannot be resolved by processing

InvalidCodesException(error)
30    def __init__(self, error):
31        super().__init__(error.message)
32        self.error = error
error
class Proto:
 35class Proto:
 36    """
 37    Define checks as list of 3 tuple: (Message, Condition, Process)
 38    - Message = The name of the condition (what is printed and logged)
 39    - Condition = True if Passed, and False if Failed
 40    - Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
 41    """
 42
 43    checks: list[
 44        tuple[
 45            str,  # The description, e.g., "Not Empty"
 46            Callable[
 47                [pd.DataFrame],
 48                pd.Series,
 49            ],  # The first lambda function: takes a list and returns a pd.Series of booleans
 50            Callable[
 51                [pd.DataFrame, Path],
 52                pd.DataFrame,
 53            ],  # The second lambda function: takes a list and a string, and returns nothing
 54        ]
 55    ]
 56
 57    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
 58        if trud_codes_path is not None:
 59            if trud_codes_path.is_file():
 60                self.trud_codes_path: Path = trud_codes_path
 61                self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
 62            else:
 63                raise FileNotFoundError(
 64                    f"Error: Read2 code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
 65                )
 66
 67        self.name: str = name
 68
 69    def raise_exception(self, ex: Exception):
 70        """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explict"""
 71        raise ex
 72
 73    def in_database(
 74        self, codes: pd.DataFrame, db: pd.DataFrame, col: str
 75    ) -> pd.DataFrame:
 76        return codes.isin(db[col])
 77
 78    def process(
 79        self, codes: pd.DataFrame, codes_file: Path
 80    ) -> Tuple[pd.DataFrame, list]:
 81        """identify issues that do not pass and fix them with define/d process"""
 82        errors = []
 83        # Iter through each item in check.
 84        for msg, cond, fix in self.checks:
 85            # Check if any codes fail the check to False
 86            if not cond(codes).all():
 87                # Log the number of codes that failed
 88                logger.debug(
 89                    f"Check: {msg} {(~cond(codes)).sum()} failed, trying to fix"
 90                )
 91                # try fix errors by running lamba "process" function
 92                try:
 93                    codes = fix(codes, codes_file)
 94                    logger.debug(f"Check: Fixed")
 95                except InvalidCodesException as ex:
 96                    errors.append(ex.error)
 97            else:
 98                logger.debug(f"Check: passed")
 99
100        return codes, errors
101
102    def verify(self, codes: pd.DataFrame, codes_file: Path):
103        """verify codes in codes file"""
104        conds = np.array([])
105
106        # Iter through each item in check.
107        for msg, cond, process in self.checks:
108            # run conditional check
109            out = cond(codes)
110            conds = np.append(conds, out.all())
111
112        return conds

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Proto(name: str, trud_codes_path: Optional[pathlib.Path] = None)
57    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
58        if trud_codes_path is not None:
59            if trud_codes_path.is_file():
60                self.trud_codes_path: Path = trud_codes_path
61                self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
62            else:
63                raise FileNotFoundError(
64                    f"Error: Read2 code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
65                )
66
67        self.name: str = name
checks: list[tuple[str, typing.Callable[[pandas.core.frame.DataFrame], pandas.core.series.Series], typing.Callable[[pandas.core.frame.DataFrame, pathlib.Path], pandas.core.frame.DataFrame]]]
name: str
def raise_exception(self, ex: Exception):
69    def raise_exception(self, ex: Exception):
70        """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explict"""
71        raise ex

Raises an exception inside a lambda function. Python does not allow a raise statement inside a lambda because a lambda can only contain expressions, not statements. The method is named raise_exception rather than raise_ because it is more explicit.

def in_database( self, codes: pandas.core.frame.DataFrame, db: pandas.core.frame.DataFrame, col: str) -> pandas.core.frame.DataFrame:
73    def in_database(
74        self, codes: pd.DataFrame, db: pd.DataFrame, col: str
75    ) -> pd.DataFrame:
76        return codes.isin(db[col])
def process( self, codes: pandas.core.frame.DataFrame, codes_file: pathlib.Path) -> Tuple[pandas.core.frame.DataFrame, list]:
 78    def process(
 79        self, codes: pd.DataFrame, codes_file: Path
 80    ) -> Tuple[pd.DataFrame, list]:
 81        """identify issues that do not pass and fix them with define/d process"""
 82        errors = []
 83        # Iter through each item in check.
 84        for msg, cond, fix in self.checks:
 85            # Check if any codes fail the check to False
 86            if not cond(codes).all():
 87                # Log the number of codes that failed
 88                logger.debug(
 89                    f"Check: {msg} {(~cond(codes)).sum()} failed, trying to fix"
 90                )
 91                # try fix errors by running lamba "process" function
 92                try:
 93                    codes = fix(codes, codes_file)
 94                    logger.debug(f"Check: Fixed")
 95                except InvalidCodesException as ex:
 96                    errors.append(ex.error)
 97            else:
 98                logger.debug(f"Check: passed")
 99
100        return codes, errors

Identify codes that do not pass a check and fix them with the check's defined process.

def verify(self, codes: pandas.core.frame.DataFrame, codes_file: pathlib.Path):
102    def verify(self, codes: pd.DataFrame, codes_file: Path):
103        """verify codes in codes file"""
104        conds = np.array([])
105
106        # Iter through each item in check.
107        for msg, cond, process in self.checks:
108            # run conditional check
109            out = cond(codes)
110            conds = np.append(conds, out.all())
111
112        return conds

verify codes in codes file

class Read2(Proto):
class Read2(Proto):
    """Validation and clean-up rules for Read2 codes.

    Extends Proto with the name "read2" and the processed TRUD file
    ``read2.parquet``. The checks run in list order: empty code list (error),
    codes shorter than 5 characters (right-padded with "."), codes longer than
    5 characters (cut to the first 5), codes containing anything other than
    letters, digits or dots (error), and codes missing from the ``self.db``
    column named after the code type (error).
    """

    def __init__(self):
        super().__init__("read2", trud.PROCESSED_PATH / "read2.parquet")

        def reject(message, compute_mask=None):
            """Return a fixer that reports the failure through raise_exception."""

            def fixer(codes, codes_file):
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=None if compute_mask is None else compute_mask(codes),
                            code_type=self.name,
                        )
                    )
                )

            return fixer

        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def long_enough(codes):
            return ~(codes.str.len() < 5)

        def short_enough(codes):
            return ~(codes.str.len() > 5)

        def alphanumeric_dot(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def found_in_db(codes):
            return self.in_database(codes, self.db, self.name)

        def pad_with_dots(codes, codes_file):
            # too-short codes are filled on the right with "." up to 5 characters
            return codes.str.pad(width=5, side="right", fillchar=".")

        def keep_first_five(codes, codes_file):
            # too-long codes are truncated to their first 5 characters
            return codes.str[:5]

        # (message, condition, process) triples, evaluated in this order
        self.checks = [
            ("Not Empty", not_empty, reject("Code list is empty")),
            ("Too Short", long_enough, pad_with_dots),
            ("Too Long", short_enough, keep_first_five),
            (
                "Alphanumeric Dot",
                alphanumeric_dot,
                reject("Illegal code format, not alphanumeric dot", alphanumeric_dot),
            ),
            (
                "In Database",
                found_in_db,
                reject("Codes do not exist in database", found_in_db),
            ),
        ]

This Read2 class extends Proto, adding custom validation checks for a dataset of "Read2" codes. It ensures that the dataset is loaded, validates the codes based on several rules, and applies corrections or logs errors when necessary.

checks
class Read3(Proto):
class Read3(Proto):
    """Validation and clean-up rules for Read3 codes.

    Uses the processed TRUD file ``read3.parquet``. The checks run in list
    order: empty code list (error), codes shorter than 5 characters
    (right-padded with "."), codes longer than 5 characters (cut to the first
    5), codes containing anything other than letters, digits or dots (error),
    and codes missing from the ``self.db`` column named after the code type
    (error).
    """

    def __init__(self):
        # NOTE(review): every other parser passes a lowercase name ("read2",
        # "icd10", ...) and this name is also used as the database column in
        # the "In Database" check; confirm that the capitalised "Read3"
        # matches the column stored in read3.parquet.
        super().__init__("Read3", trud.PROCESSED_PATH / "read3.parquet")

        self.checks = [
            (
                # an empty code list cannot be repaired, so report it
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            "Code list is empty",
                            codes=codes,
                            codes_file=codes_file,
                            mask=None,
                            code_type=self.name,
                        )
                    )
                ),
            ),
            (
                # codes shorter than 5 characters are padded on the right with "."
                "Too Short",
                lambda codes: ~(codes.str.len() < 5),
                lambda codes, codes_file: codes.str.pad(
                    width=5, side="right", fillchar="."
                ),
            ),
            (
                # codes longer than 5 characters are truncated to 5
                "Too Long",
                lambda codes: ~(codes.str.len() > 5),
                lambda codes, codes_file: codes.str[:5],
            ),
            (
                # only letters, digits and dots are allowed
                "Alphanumeric Dot",
                lambda codes: codes.str.match(r"^[a-zA-Z0-9.]+$"),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            "QA Alphanumeric Dot",
                            codes=codes,
                            codes_file=codes_file,
                            # CodesError has no "check_regex" parameter; the
                            # per-code check result belongs in "mask".
                            mask=codes.str.match(r"^[a-zA-Z0-9.]+$"),
                            code_type=self.name,
                        )
                    )
                ),
            ),
            (
                # every code must be present in the database column
                "In Database",
                lambda codes: self.in_database(codes, self.db, self.name),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            "QA In Database",
                            codes=codes,
                            codes_file=codes_file,
                            # CodesError has no "check_regex" parameter; the
                            # per-code check result belongs in "mask".
                            mask=self.in_database(codes, self.db, self.name),
                            code_type=self.name,
                        )
                    )
                ),
            ),
        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Icd10(Proto):
class Icd10(Proto):
    """Validation and clean-up rules for ICD-10 codes.

    Uses the processed TRUD file ``icd10.parquet``. The checks run in list
    order: empty code list (error), codes shorter than 3 characters (error),
    codes containing a dot (all dots removed), codes containing anything other
    than capital letters and digits (error), and codes found in neither the
    ``icd10`` nor the ``icd10_alt`` column of ``self.db`` (error).
    """

    def __init__(self):
        super().__init__("icd10", trud.PROCESSED_PATH / "icd10.parquet")

        def in_either_column(codes):
            # A code passes when it is in the main column OR the "_alt" column
            # (De Morgan form of the original ~(~a & ~b)).
            return self.in_database(
                codes, self.db, self.name
            ) | self.in_database(codes, self.db, self.name + "_alt")

        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            f"Code list is empty {codes_file}",
                            codes=codes,
                            codes_file=codes_file,
                            mask=None,
                            code_type=self.name,
                        )
                    )
                ),
            ),
            (
                "Too Short",
                lambda codes: ~(codes.str.len() < 3),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            "QA Too Short",
                            codes=codes,
                            codes_file=codes_file,
                            mask=~(codes.str.len() < 3),
                            code_type=self.name,
                        )
                    )
                ),
            ),
            (
                "Has Dot",
                lambda codes: ~(codes.str.match(r".*\..*")),  # check if contains dot
                # Delete every dot. regex=False is explicit because "." must be
                # a literal dot, not a regex wildcard, and the default value of
                # `regex` differs between pandas versions.
                lambda codes, codes_file: codes.str.replace(".", "", regex=False),
            ),
            (
                "Alphanumeric Capital",
                lambda codes: codes.str.match(r"^[A-Z0-9]+$"),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            "QA Alphanumeric Capital",
                            codes=codes,
                            codes_file=codes_file,
                            mask=codes.str.match(r"^[A-Z0-9]+$"),
                            code_type=self.name,
                        )
                    )
                ),
            ),
            (
                "In Database",
                in_either_column,
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            "QA In Database",
                            codes=codes,
                            codes_file=codes_file,
                            mask=in_either_column(codes),
                            code_type=self.name,
                        )
                    )
                ),
            ),
            # NOTE: a stricter "ICD10 Regex" check (letter, two digits, optional
            # dot, then alphanumerics) was sketched here previously but is not
            # registered.
        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Snomed(Proto):
class Snomed(Proto):
    """Validation rules for SNOMED codes.

    Uses the processed TRUD file ``snomed.parquet``. Every check reports an
    error when it fails; none of them modifies the codes. In list order: codes
    shorter than 6 characters, codes longer than 18 characters, codes that do
    not match the digits pattern, and codes missing from the ``self.db`` column
    named after the code type.
    """

    def __init__(self):
        super().__init__("snomed", trud.PROCESSED_PATH / "snomed.parquet")

        def reject(message, compute_mask):
            """Return a fixer that reports the failure through raise_exception."""

            def fixer(codes, codes_file):
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=compute_mask(codes),
                            code_type=self.name,
                        )
                    )
                )

            return fixer

        def long_enough(codes):
            return ~(codes.str.len() < 6)

        def short_enough(codes):
            return ~(codes.str.len() > 18)

        def digits_only(codes):
            return codes.str.match(r"[0-9]+$")

        def found_in_db(codes):
            return self.in_database(codes, self.db, self.name)

        # NOTE: no "Not Empty" or "Is Integer" check is registered for this
        # code type.
        self.checks = [
            ("Too Short", long_enough, reject("QA Too Short", long_enough)),
            ("Too Long", short_enough, reject("QA Too Long", short_enough)),
            ("Numeric", digits_only, reject("QA Numeric", digits_only)),
            ("In Database", found_in_db, reject("QA In Database", found_in_db)),
        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Opcs4(Proto):
class Opcs4(Proto):
    """Validation rules for OPCS-4 codes.

    Uses the processed TRUD file ``opcs4.parquet``. Two checks, both reporting
    an error on failure: the code list must not be empty, and every code must
    be present in the ``self.db`` column named after the code type.
    """

    def __init__(self):
        super().__init__("opcs4", trud.PROCESSED_PATH / "opcs4.parquet")

        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def found_in_db(codes):
            return self.in_database(codes, self.db, self.name)

        def report_empty(codes, codes_file):
            failure = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(failure))

        def report_missing(codes, codes_file):
            failure = CodesError(
                "QA In Database",
                codes=codes,
                codes_file=codes_file,
                mask=found_in_db(codes),
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(failure))

        self.checks = [
            ("Not Empty", not_empty, report_empty),
            ("In Database", found_in_db, report_missing),
        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Atc(Proto):
class Atc(Proto):
    """Validation rules for ATC codes.

    No TRUD file is passed to Proto (``trud_codes_path=None``). Two checks,
    both reporting an error on failure: the code list must not be empty, and
    every code must consist only of capital letters and digits.
    """

    def __init__(self):
        super().__init__("atc", trud_codes_path=None)

        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def capital_alphanumeric(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def report_empty(codes, codes_file):
            failure = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(failure))

        def report_illegal(codes, codes_file):
            failure = CodesError(
                "QA Alphanumeric Capital",
                codes=codes,
                codes_file=codes_file,
                mask=capital_alphanumeric(codes),
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(failure))

        self.checks = [
            ("Not Empty", not_empty, report_empty),
            ("Alphanumeric Capital", capital_alphanumeric, report_illegal),
        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Med(Proto):
class Med(Proto):
    """Validation rules for "med" codes.

    No TRUD file is passed to Proto (``trud_codes_path=None``). The only check
    is that the code list is not empty; an empty list is reported as an error.
    """

    def __init__(self):
        super().__init__("med", trud_codes_path=None)

        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def report_empty(codes, codes_file):
            failure = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(failure))

        self.checks = [("Not Empty", not_empty, report_empty)]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Cprd(Proto):
class Cprd(Proto):
    """Validation rules for "cprd" codes.

    No TRUD file is passed to Proto (``trud_codes_path=None``). The only check
    is that the code list is not empty; an empty list is reported as an error.
    """

    def __init__(self):
        super().__init__("cprd", trud_codes_path=None)

        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def report_empty(codes, codes_file):
            failure = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(failure))

        self.checks = [("Not Empty", not_empty, report_empty)]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class CodeTypeParser:
class CodeTypeParser:
    """Creates one parser per code type and keeps them in ``code_types``.

    ``code_types`` maps the code type name ("read2", "read3", "icd10",
    "snomed", "opcs4", "atc", "med", "cprd") to an instance of the matching
    parser class.
    """

    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
        """Build all parsers.

        Raises:
            FileNotFoundError: if ``trud_processed_dir`` does not exist or is
                not a directory.
        """
        directory_ready = trud_processed_dir.exists() and trud_processed_dir.is_dir()
        if not directory_ready:
            raise FileNotFoundError(
                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
            )

        # Parsers are instantiated in this order; dict insertion order follows it.
        parser_classes = (
            ("read2", Read2),
            ("read3", Read3),
            ("icd10", Icd10),
            ("snomed", Snomed),
            ("opcs4", Opcs4),
            ("atc", Atc),
            ("med", Med),
            ("cprd", Cprd),
        )
        self.code_types = {
            type_name: parser_cls() for type_name, parser_cls in parser_classes
        }

A class that creates one parser instance for each code type (read2, read3, icd10, snomed, opcs4, atc, med, cprd) and stores them in `code_types`.

CodeTypeParser(trud_processed_dir: pathlib.Path = PosixPath('vocab/trud/processed'))
543    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
544        if not trud_processed_dir.exists() or not trud_processed_dir.is_dir():
545            raise FileNotFoundError(
546                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
547            )
548
549        self.code_types = {
550            "read2": Read2(),
551            "read3": Read3(),
552            "icd10": Icd10(),
553            "snomed": Snomed(),
554            "opcs4": Opcs4(),
555            "atc": Atc(),
556            "med": Med(),
557            "cprd": Cprd(),
558        }
code_types