acmc.parse

parse.py module

This module provides functionality to set up medical code translation classes

  1"""
  2parse.py module
  3
  4This module provides functionality to set up medical code translation classes
  5
  6"""
  7
  8import pandas as pd
  9import numpy as np
 10import os
 11from typing import Callable, Optional, Tuple
 12from pathlib import Path
 13from acmc import trud, logging_config as lc
 14
 15# setup logging
 16_logger = lc.setup_logger()
 17
 18SUPPORTED_CODE_TYPES = {"read2", "read3", "icd10", "snomed", "opcs4", "atc"}
 19"""List of support medical coding types"""
 20
 21
 22class CodesError:
 23    """A class used in InvalidCodesException to report an error if a code parser check fails"""
 24
 25    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
 26        # initialise class variables with provided parameters
 27        for key, value in locals().items():
 28            if key != "self":
 29                setattr(self, key, value)
 30
 31
 32class InvalidCodesException(Exception):
 33    """Custom exception class raised when invalid codes are found that cannot be resolved by processing"""
 34
 35    def __init__(self, error):
 36        super().__init__(error.message)
 37        self.error = error
 38
 39
 40class Proto:
 41    """
 42    Define checks as list of 3 tuple: (Message, Condition, Process)
 43    - Message = The name of the condition (what is printed and logged)
 44    - Condition = True if Passed, and False if Failed
 45    - Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
 46    """
 47
 48    checks: list[
 49        tuple[
 50            str,  # The description, e.g., "Not Empty"
 51            Callable[
 52                [pd.DataFrame],
 53                pd.Series,
 54            ],  # The first lambda function: takes a list and returns a pd.Series of booleans
 55            Callable[
 56                [pd.DataFrame, Path],
 57                pd.DataFrame,
 58            ],  # The second lambda function: takes a list and a string, and returns nothing
 59        ]
 60    ]
 61
 62    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
 63        if trud_codes_path is not None:
 64            if trud_codes_path.is_file():
 65                self.trud_codes_path: Path = trud_codes_path
 66                self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
 67            else:
 68                raise FileNotFoundError(
 69                    f"Error: Read2 code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
 70                )
 71
 72        self.name: str = name
 73
 74    def raise_exception(self, ex: Exception):
 75        """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explict"""
 76        raise ex
 77
 78    def in_database(
 79        self, codes: pd.DataFrame, db: pd.DataFrame, col: str
 80    ) -> pd.DataFrame:
 81        return codes.isin(db[col])
 82
 83    def process(
 84        self, codes: pd.DataFrame, codes_file: Path
 85    ) -> Tuple[pd.DataFrame, list]:
 86        """identify issues that do not pass and fix them with define/d process"""
 87        errors = []
 88        # Iter through each item in check.
 89        for msg, cond, fix in self.checks:
 90            # Check if any codes fail the check to False
 91            if not cond(codes).all():
 92                # Log the number of codes that failed
 93                _logger.debug(
 94                    f"Check: {msg} {(~cond(codes)).sum()} failed, trying to fix"
 95                )
 96                # try fix errors by running lamba "process" function
 97                try:
 98                    codes = fix(codes, codes_file)
 99                    _logger.debug(f"Check: Fixed")
100                except InvalidCodesException as ex:
101                    errors.append(ex.error)
102            else:
103                _logger.debug(f"Check: passed")
104
105        return codes, errors
106
107    def verify(self, codes: pd.DataFrame, codes_file: Path):
108        """verify codes in codes file"""
109        conds = np.array([])
110
111        # Iter through each item in check.
112        for msg, cond, process in self.checks:
113            # run conditional check
114            out = cond(codes)
115            conds = np.append(conds, out.all())
116
117        return conds
118
119
class Read2(Proto):
    """Parser for Read2 codes.

    Loads the Read2 table through Proto and defines the checks applied to a
    code list: short codes are padded, long codes are truncated, and the
    remaining failures are reported as InvalidCodesException.
    """

    def __init__(self):
        super().__init__("read2", trud.PROCESSED_PATH / "read2.parquet")

        def is_alnum_dot(codes):
            # only letters, digits and dots are allowed
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def is_known(codes):
            # code must exist in the loaded Read2 table
            return self.in_database(codes, self.db, self.name)

        def reject(message, mask_of=None):
            """Build a fix step that cannot repair the codes and raises instead."""

            def fixer(codes, codes_file):
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=None if mask_of is None else mask_of(codes),
                            code_type=self.name,
                        )
                    )
                )

            return fixer

        # validate checks
        not_empty = (
            "Not Empty",
            lambda codes: pd.Series([len(codes) > 0]),
            reject("Code list is empty"),
        )
        too_short = (
            # codes shorter than 5 characters are right-padded with dots
            "Too Short",
            lambda codes: ~(codes.str.len() < 5),
            lambda codes, codes_file: codes.str.pad(
                width=5, side="right", fillchar="."
            ),
        )
        too_long = (
            # codes longer than 5 characters are cut to their first 5
            "Too Long",
            lambda codes: ~(codes.str.len() > 5),
            lambda codes, codes_file: codes.str[:5],
        )
        alnum_dot = (
            "Alphanumeric Dot",
            is_alnum_dot,
            reject("Illegal code format, not alphanumeric dot", is_alnum_dot),
        )
        in_db = (
            "In Database",
            is_known,
            reject("Codes do not exist in database", is_known),
        )

        self.checks = [not_empty, too_short, too_long, alnum_dot, in_db]
191
192
class Read3(Proto):
    """Parser for Read3 codes.

    Loads the Read3 table through Proto and defines the checks applied to a
    code list: short codes are padded, long codes are truncated, and the
    remaining failures are reported as InvalidCodesException.
    """

    def __init__(self):
        # NOTE(review): the name is "Read3" while the other parsers and
        # SUPPORTED_CODE_TYPES use lowercase names; self.name is also used as
        # the database column in the "In Database" check - confirm the column
        # name in read3.parquet.
        super().__init__("Read3", trud.PROCESSED_PATH / "read3.parquet")

        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            f"Code list is empty",
                            codes=codes,
                            codes_file=codes_file,
                            mask=None,
                            code_type=self.name,
                        )
                    )
                ),
            ),
            (
                # codes shorter than 5 characters are right-padded with dots
                "Too Short",
                lambda codes: ~(codes.str.len() < 5),
                lambda codes, codes_file: codes.str.pad(
                    width=5, side="right", fillchar="."
                ),
            ),
            (
                # codes longer than 5 characters are cut to their first 5
                "Too Long",
                lambda codes: ~(codes.str.len() > 5),
                lambda codes, codes_file: codes.str[:5],
            ),
            (
                "Alphanumeric Dot",
                lambda codes: codes.str.match(r"^[a-zA-Z0-9.]+$"),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            f"QA Alphanumeric Dot",
                            codes=codes,
                            codes_file=codes_file,
                            # CodesError has no "check_regex" parameter; passing
                            # it raised TypeError instead of the intended
                            # InvalidCodesException. The parameter is "mask".
                            mask=codes.str.match(r"^[a-zA-Z0-9.]+$"),
                            code_type=self.name,
                        )
                    )
                ),
            ),
            (
                "In Database",
                lambda codes: self.in_database(codes, self.db, self.name),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            f"QA In Database",
                            codes=codes,
                            codes_file=codes_file,
                            # "mask", not "check_regex" (see above)
                            mask=self.in_database(codes, self.db, self.name),
                            code_type=self.name,
                        )
                    )
                ),
            ),
        ]
256
257
class Icd10(Proto):
    """Parser for ICD10 codes.

    Loads the ICD10 table through Proto and defines the checks applied to a
    code list. Dots are stripped from codes; every other failure is reported
    as InvalidCodesException.
    """

    def __init__(self):
        super().__init__("icd10", trud.PROCESSED_PATH / "icd10.parquet")

        def is_known(codes):
            # A code is valid when it appears in either the main column or the
            # "_alt" column of the table.
            return self.in_database(
                codes, self.db, self.name
            ) | self.in_database(codes, self.db, self.name + "_alt")

        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            f"Code list is empty {codes_file}",
                            codes=codes,
                            codes_file=codes_file,
                            mask=None,
                            code_type=self.name,
                        )
                    )
                ),
            ),
            (
                "Too Short",
                lambda codes: ~(codes.str.len() < 3),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            f"QA Too Short",
                            codes=codes,
                            codes_file=codes_file,
                            mask=~(codes.str.len() < 3),
                            code_type=self.name,
                        )
                    )
                ),
            ),
            (
                "Has Dot",
                lambda codes: ~(codes.str.match(r".*\..*")),  # check if contains dot
                # Delete any dots in the string. regex=False makes "." a
                # literal dot regardless of the pandas default; in regex mode
                # "." would match, and delete, every character.
                lambda codes, codes_file: codes.str.replace(".", "", regex=False),
            ),
            (
                "Alphanumeric Capital",
                lambda codes: codes.str.match(r"^[A-Z0-9]+$"),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            f"QA Alphanumeric Capital",
                            codes=codes,
                            codes_file=codes_file,
                            mask=codes.str.match(r"^[A-Z0-9]+$"),
                            code_type=self.name,
                        )
                    )
                ),
            ),
            (
                "In Database",
                is_known,
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            f"QA In Database",
                            codes=codes,
                            codes_file=codes_file,
                            mask=is_known(codes),
                            code_type=self.name,
                        )
                    )
                ),
            ),
            # NOTE: a stricter "ICD10 Regex" format check
            # ([a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$) was previously sketched here
            # but is disabled.
        ]
346
347
class Snomed(Proto):
    """Parser for SNOMED codes.

    Loads the SNOMED table through Proto and defines the checks applied to a
    code list. No check repairs the codes; each failure is reported as
    InvalidCodesException. The "Not Empty" and "Is Integer" checks are
    currently disabled for this code type.
    """

    def __init__(self):
        super().__init__("snomed", trud.PROCESSED_PATH / "snomed.parquet")

        def long_enough(codes):
            return ~(codes.str.len() < 6)

        def short_enough(codes):
            return ~(codes.str.len() > 18)

        def is_numeric(codes):
            return codes.str.match(r"[0-9]+$")

        def is_known(codes):
            return self.in_database(codes, self.db, self.name)

        def reject(message, mask_of):
            """Build a fix step that cannot repair the codes and raises instead."""

            def fixer(codes, codes_file):
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask_of(codes),
                            code_type=self.name,
                        )
                    )
                )

            return fixer

        rules = (
            ("Too Short", long_enough, "QA Too Short"),
            ("Too Long", short_enough, "QA Too Long"),
            ("Numeric", is_numeric, "QA Numeric"),
            ("In Database", is_known, "QA In Database"),
        )
        # each rule reports its own condition result as the error mask
        self.checks = [
            (label, condition, reject(message, condition))
            for label, condition, message in rules
        ]
424
425
class Opcs4(Proto):
    """Parser for OPCS4 codes.

    Loads the OPCS4 table through Proto and checks that the code list is not
    empty and that every code exists in the table.
    """

    def __init__(self):
        super().__init__("opcs4", trud.PROCESSED_PATH / "opcs4.parquet")

        def is_known(codes):
            return self.in_database(codes, self.db, self.name)

        def fail(message, codes, codes_file, mask):
            # neither check can be repaired, so both fix steps only raise
            return self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: fail(
                    "Code list is empty", codes, codes_file, None
                ),
            ),
            (
                "In Database",
                is_known,
                lambda codes, codes_file: fail(
                    "QA In Database", codes, codes_file, is_known(codes)
                ),
            ),
        ]
462
463
class Atc(Proto):
    """Parser for ATC codes.

    No code table is loaded for this type. Checks that the code list is not
    empty and that every code consists of capital letters and digits only.
    """

    def __init__(self):
        super().__init__("atc", trud_codes_path=None)

        def is_capital_alnum(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def fail(message, codes, codes_file, mask):
            # neither check can be repaired, so both fix steps only raise
            return self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: fail(
                    "Code list is empty", codes, codes_file, None
                ),
            ),
            (
                "Alphanumeric Capital",
                is_capital_alnum,
                lambda codes, codes_file: fail(
                    "QA Alphanumeric Capital",
                    codes,
                    codes_file,
                    is_capital_alnum(codes),
                ),
            ),
        ]
499
500
class Med(Proto):
    """Parser for "med" codes.

    No code table is loaded for this type; the only check is that the code
    list is not empty.
    """

    def __init__(self):
        super().__init__("med", trud_codes_path=None)

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def report_empty(codes, codes_file):
            # an empty list cannot be repaired, so the fix step only raises
            return self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        "Code list is empty",
                        codes=codes,
                        codes_file=codes_file,
                        mask=None,
                        code_type=self.name,
                    )
                )
            )

        self.checks = [("Not Empty", has_codes, report_empty)]
521
522
class Cprd(Proto):
    """Parser for "cprd" codes.

    No code table is loaded for this type; the only check is that the code
    list is not empty.
    """

    def __init__(self):
        super().__init__("cprd", trud_codes_path=None)

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def report_empty(codes, codes_file):
            # an empty list cannot be repaired, so the fix step only raises
            return self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        "Code list is empty",
                        codes=codes,
                        codes_file=codes_file,
                        mask=None,
                        code_type=self.name,
                    )
                )
            )

        self.checks = [("Not Empty", has_codes, report_empty)]
543
544
class CodeTypeParser:
    """Holds one parser instance per code type, keyed by code type name.

    Raises FileNotFoundError on construction when the TRUD processed
    directory is missing or is not a directory.
    """

    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
        # is_dir() is False for a missing path too, so it covers both cases
        if not trud_processed_dir.is_dir():
            raise FileNotFoundError(
                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
            )

        parser_classes = (
            ("read2", Read2),
            ("read3", Read3),
            ("icd10", Icd10),
            ("snomed", Snomed),
            ("opcs4", Opcs4),
            ("atc", Atc),
            ("med", Med),
            ("cprd", Cprd),
        )
        # instantiate in the listed order; each parser loads its own table
        self.code_types = {key: parser() for key, parser in parser_classes}
SUPPORTED_CODE_TYPES = {'icd10', 'read2', 'snomed', 'atc', 'opcs4', 'read3'}

Set of supported medical coding types

class CodesError:
class CodesError:
    """Describes a failed code parser check; carried by InvalidCodesException.

    Attributes:
        message: Description of the check that failed.
        codes: The codes that were being checked, if supplied.
        codes_file: The file the codes were read from, if supplied.
        mask: Per-code result of the failed check, if supplied.
        code_type: Name of the coding system being parsed, if supplied.
    """

    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
        # Assign each attribute explicitly instead of iterating over locals():
        # the attribute set is then visible to readers and tooling, and cannot
        # change by accident if another local is introduced in this method.
        self.message = message
        self.codes = codes
        self.codes_file = codes_file
        self.mask = mask
        self.code_type = code_type

A class used in InvalidCodesException to report an error if a code parser check fails

CodesError(message, codes=None, codes_file=None, mask=None, code_type=None)
26    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
27        # initialise class variables with provided parameters
28        for key, value in locals().items():
29            if key != "self":
30                setattr(self, key, value)
class InvalidCodesException(builtins.Exception):
class InvalidCodesException(Exception):
    """Raised when invalid codes are found that processing cannot resolve.

    The exception text is taken from ``error.message``; the full error object
    is kept on ``self.error`` so handlers can inspect it.
    """

    def __init__(self, error):
        text = error.message
        super().__init__(text)
        self.error = error

Custom exception class raised when invalid codes are found that cannot be resolved by processing

InvalidCodesException(error)
36    def __init__(self, error):
37        super().__init__(error.message)
38        self.error = error
error
class Proto:
 41class Proto:
 42    """
 43    Define checks as list of 3 tuple: (Message, Condition, Process)
 44    - Message = The name of the condition (what is printed and logged)
 45    - Condition = True if Passed, and False if Failed
 46    - Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
 47    """
 48
 49    checks: list[
 50        tuple[
 51            str,  # The description, e.g., "Not Empty"
 52            Callable[
 53                [pd.DataFrame],
 54                pd.Series,
 55            ],  # The first lambda function: takes a list and returns a pd.Series of booleans
 56            Callable[
 57                [pd.DataFrame, Path],
 58                pd.DataFrame,
 59            ],  # The second lambda function: takes a list and a string, and returns nothing
 60        ]
 61    ]
 62
 63    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
 64        if trud_codes_path is not None:
 65            if trud_codes_path.is_file():
 66                self.trud_codes_path: Path = trud_codes_path
 67                self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
 68            else:
 69                raise FileNotFoundError(
 70                    f"Error: Read2 code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
 71                )
 72
 73        self.name: str = name
 74
 75    def raise_exception(self, ex: Exception):
 76        """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explict"""
 77        raise ex
 78
 79    def in_database(
 80        self, codes: pd.DataFrame, db: pd.DataFrame, col: str
 81    ) -> pd.DataFrame:
 82        return codes.isin(db[col])
 83
 84    def process(
 85        self, codes: pd.DataFrame, codes_file: Path
 86    ) -> Tuple[pd.DataFrame, list]:
 87        """identify issues that do not pass and fix them with define/d process"""
 88        errors = []
 89        # Iter through each item in check.
 90        for msg, cond, fix in self.checks:
 91            # Check if any codes fail the check to False
 92            if not cond(codes).all():
 93                # Log the number of codes that failed
 94                _logger.debug(
 95                    f"Check: {msg} {(~cond(codes)).sum()} failed, trying to fix"
 96                )
 97                # try fix errors by running lamba "process" function
 98                try:
 99                    codes = fix(codes, codes_file)
100                    _logger.debug(f"Check: Fixed")
101                except InvalidCodesException as ex:
102                    errors.append(ex.error)
103            else:
104                _logger.debug(f"Check: passed")
105
106        return codes, errors
107
108    def verify(self, codes: pd.DataFrame, codes_file: Path):
109        """verify codes in codes file"""
110        conds = np.array([])
111
112        # Iter through each item in check.
113        for msg, cond, process in self.checks:
114            # run conditional check
115            out = cond(codes)
116            conds = np.append(conds, out.all())
117
118        return conds

Define checks as a list of 3-tuples: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Proto(name: str, trud_codes_path: Optional[pathlib.Path] = None)
63    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
64        if trud_codes_path is not None:
65            if trud_codes_path.is_file():
66                self.trud_codes_path: Path = trud_codes_path
67                self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
68            else:
69                raise FileNotFoundError(
70                    f"Error: Read2 code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
71                )
72
73        self.name: str = name
checks: list[tuple[str, typing.Callable[[pandas.core.frame.DataFrame], pandas.core.series.Series], typing.Callable[[pandas.core.frame.DataFrame, pathlib.Path], pandas.core.frame.DataFrame]]]
name: str
def raise_exception(self, ex: Exception):
75    def raise_exception(self, ex: Exception):
76        """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explict"""
77        raise ex

Raises an exception from inside a lambda function. Python does not allow the raise statement inside a lambda, because a lambda can only contain expressions, not statements. Named raise_exception rather than raise_ because it is more explicit.

def in_database( self, codes: pandas.core.frame.DataFrame, db: pandas.core.frame.DataFrame, col: str) -> pandas.core.frame.DataFrame:
79    def in_database(
80        self, codes: pd.DataFrame, db: pd.DataFrame, col: str
81    ) -> pd.DataFrame:
82        return codes.isin(db[col])
def process( self, codes: pandas.core.frame.DataFrame, codes_file: pathlib.Path) -> Tuple[pandas.core.frame.DataFrame, list]:
 84    def process(
 85        self, codes: pd.DataFrame, codes_file: Path
 86    ) -> Tuple[pd.DataFrame, list]:
 87        """identify issues that do not pass and fix them with define/d process"""
 88        errors = []
 89        # Iter through each item in check.
 90        for msg, cond, fix in self.checks:
 91            # Check if any codes fail the check to False
 92            if not cond(codes).all():
 93                # Log the number of codes that failed
 94                _logger.debug(
 95                    f"Check: {msg} {(~cond(codes)).sum()} failed, trying to fix"
 96                )
 97                # try fix errors by running lamba "process" function
 98                try:
 99                    codes = fix(codes, codes_file)
100                    _logger.debug(f"Check: Fixed")
101                except InvalidCodesException as ex:
102                    errors.append(ex.error)
103            else:
104                _logger.debug(f"Check: passed")
105
106        return codes, errors

Identify checks that do not pass and fix them with the defined process

def verify(self, codes: pandas.core.frame.DataFrame, codes_file: pathlib.Path):
108    def verify(self, codes: pd.DataFrame, codes_file: Path):
109        """verify codes in codes file"""
110        conds = np.array([])
111
112        # Iter through each item in check.
113        for msg, cond, process in self.checks:
114            # run conditional check
115            out = cond(codes)
116            conds = np.append(conds, out.all())
117
118        return conds

Verify the codes in the codes file against every check

class Read2(Proto):
class Read2(Proto):
    """Validation rules for "Read2" codes.

    Extends Proto with the Read2 check list: the code list must not be empty, codes
    shorter than 5 characters are right-padded with dots, codes longer than 5
    characters are truncated, and codes must consist of letters, digits and dots and
    be present in the Read2 lookup table (self.db). Checks that cannot be repaired
    are reported through InvalidCodesException.
    """

    def __init__(self):
        super().__init__("read2", trud.PROCESSED_PATH / "read2.parquet")

        # --- conditions: True where a code passes ---
        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def long_enough(codes):
            return ~(codes.str.len() < 5)

        def short_enough(codes):
            return ~(codes.str.len() > 5)

        def alphanumeric_dot(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known_code(codes):
            return self.in_database(codes, self.db, self.name)

        # --- processes: repair the codes or report the failure ---
        def pad_with_dots(codes, codes_file):
            return codes.str.pad(width=5, side="right", fillchar=".")

        def truncate(codes, codes_file):
            return codes.str[:5]

        def reject(message, mask_of=None):
            # build a process step that reports the failure instead of repairing it;
            # mask_of recomputes the pass/fail mask that is attached to the error
            def step(codes, codes_file):
                mask = None if mask_of is None else mask_of(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return step

        # validate checks, each a (message, condition, process) tuple
        self.checks = [
            # an empty code list cannot be repaired
            ("Not Empty", not_empty, reject("Code list is empty")),
            # codes under 5 characters are padded with dots up to 5 characters
            ("Too Short", long_enough, pad_with_dots),
            # codes over 5 characters are cut down to their first 5 characters
            ("Too Long", short_enough, truncate),
            # only letters, digits and dots are allowed
            (
                "Alphanumeric Dot",
                alphanumeric_dot,
                reject("Illegal code format, not alphanumeric dot", alphanumeric_dot),
            ),
            # the code has to exist in the Read2 lookup table
            (
                "In Database",
                known_code,
                reject("Codes do not exist in database", known_code),
            ),
        ]

This Read2 class extends Proto, adding custom validation checks for a dataset of "Read2" codes. It ensures that the dataset is loaded, validates the codes based on several rules, and applies corrections or logs errors when necessary.

checks
class Read3(Proto):
class Read3(Proto):
    """Validation rules for "Read3" codes.

    Extends Proto with the Read3 check list: the code list must not be empty, codes
    shorter than 5 characters are right-padded with dots, codes longer than 5
    characters are truncated, and codes must consist of letters, digits and dots and
    be present in the Read3 lookup table (self.db). Checks that cannot be repaired
    are reported through InvalidCodesException.
    """

    def __init__(self):
        # NOTE(review): the name is "Read3" while the other parsers use lowercase
        # names; self.name is used as the column looked up in self.db and as the
        # code_type reported in errors - confirm the casing is intended
        super().__init__("Read3", trud.PROCESSED_PATH / "read3.parquet")

        def alphanumeric_dot(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known_code(codes):
            return self.in_database(codes, self.db, self.name)

        def reject(message, mask_of=None):
            # build a process step that reports the failure instead of repairing it.
            # The pass/fail mask must be passed as `mask`: CodesError accepts no
            # other keyword for it, and any other name raises TypeError, which
            # hides the InvalidCodesException the caller expects.
            def step(codes, codes_file):
                mask = None if mask_of is None else mask_of(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return step

        # validate checks, each a (message, condition, process) tuple
        self.checks = [
            # an empty code list cannot be repaired
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                reject("Code list is empty"),
            ),
            # codes under 5 characters are padded with dots up to 5 characters
            (
                "Too Short",
                lambda codes: ~(codes.str.len() < 5),
                lambda codes, codes_file: codes.str.pad(
                    width=5, side="right", fillchar="."
                ),
            ),
            # codes over 5 characters are cut down to their first 5 characters
            (
                "Too Long",
                lambda codes: ~(codes.str.len() > 5),
                lambda codes, codes_file: codes.str[:5],
            ),
            # only letters, digits and dots are allowed
            (
                "Alphanumeric Dot",
                alphanumeric_dot,
                reject("QA Alphanumeric Dot", alphanumeric_dot),
            ),
            # the code has to exist in the Read3 lookup table
            (
                "In Database",
                known_code,
                reject("QA In Database", known_code),
            ),
        ]

Define checks as a list of 3-tuples: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Icd10(Proto):
class Icd10(Proto):
    """Validation rules for "icd10" codes.

    Extends Proto with the ICD10 check list: the code list must not be empty, codes
    must be at least 3 characters long, dots are stripped from codes, and codes must
    consist of capital letters and digits and be present in either the "icd10" or
    the "icd10_alt" column of the lookup table (self.db). Checks that cannot be
    repaired are reported through InvalidCodesException.
    """

    def __init__(self):
        super().__init__("icd10", trud.PROCESSED_PATH / "icd10.parquet")

        def long_enough(codes):
            return ~(codes.str.len() < 3)

        def alphanumeric_capital(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def known_code(codes):
            # a code is valid when it is found in the main or the alternative column
            return self.in_database(codes, self.db, self.name) | self.in_database(
                codes, self.db, self.name + "_alt"
            )

        def strip_dots(codes, codes_file):
            # regex=False: the dot is removed literally whatever the pandas default
            # for `regex` is, and is never treated as the regex wildcard
            return codes.str.replace(".", "", regex=False)

        def report(message, codes, codes_file, mask=None):
            # report a failure that cannot be repaired
            return self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        # validate checks, each a (message, condition, process) tuple
        self.checks = [
            # an empty code list cannot be repaired
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: report(
                    f"Code list is empty {codes_file}", codes, codes_file
                ),
            ),
            # codes under 3 characters are invalid
            (
                "Too Short",
                long_enough,
                lambda codes, codes_file: report(
                    "QA Too Short", codes, codes_file, mask=long_enough(codes)
                ),
            ),
            # codes containing a dot fail the condition and have their dots deleted
            (
                "Has Dot",
                lambda codes: ~(codes.str.match(r".*\..*")),
                strip_dots,
            ),
            # only capital letters and digits are allowed
            (
                "Alphanumeric Capital",
                alphanumeric_capital,
                lambda codes, codes_file: report(
                    "QA Alphanumeric Capital",
                    codes,
                    codes_file,
                    mask=alphanumeric_capital(codes),
                ),
            ),
            # the code has to exist in the lookup table
            (
                "In Database",
                known_code,
                lambda codes, codes_file: report(
                    "QA In Database", codes, codes_file, mask=known_code(codes)
                ),
            ),
            # NOTE: a stricter "ICD10 Regex" shape check (letter, two digits, optional
            # dot, alphanumeric tail) is not enabled
        ]

Define checks as a list of 3-tuples: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Snomed(Proto):
class Snomed(Proto):
    """Validation rules for "snomed" codes.

    Extends Proto with the SNOMED check list: codes must be between 6 and 18
    characters long, consist of digits and be present in the SNOMED lookup table
    (self.db). None of the checks repairs codes; every failure is reported through
    InvalidCodesException.
    """

    def __init__(self):
        super().__init__("snomed", trud.PROCESSED_PATH / "snomed.parquet")

        # --- conditions: True where a code passes ---
        def long_enough(codes):
            return ~(codes.str.len() < 6)

        def short_enough(codes):
            return ~(codes.str.len() > 18)

        def numeric(codes):
            return codes.str.match(r"[0-9]+$")

        def known_code(codes):
            return self.in_database(codes, self.db, self.name)

        def reject(message, mask_of):
            # build a process step that reports the failure together with the
            # recomputed pass/fail mask
            def step(codes, codes_file):
                mask = mask_of(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return step

        # NOTE(review): no "Not Empty" check and no integer conversion are active
        # for this parser - confirm this is intended
        self.checks = [
            ("Too Short", long_enough, reject("QA Too Short", long_enough)),
            ("Too Long", short_enough, reject("QA Too Long", short_enough)),
            ("Numeric", numeric, reject("QA Numeric", numeric)),
            ("In Database", known_code, reject("QA In Database", known_code)),
        ]

Define checks as a list of 3-tuples: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Opcs4(Proto):
class Opcs4(Proto):
    """Validation rules for "opcs4" codes.

    Extends Proto with the OPCS4 check list: the code list must not be empty and
    every code must be present in the OPCS4 lookup table (self.db). Failures are
    reported through InvalidCodesException.
    """

    def __init__(self):
        super().__init__("opcs4", trud.PROCESSED_PATH / "opcs4.parquet")

        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def known_code(codes):
            return self.in_database(codes, self.db, self.name)

        def reject(message, mask_of=None):
            # build a process step that reports the failure; mask_of recomputes the
            # pass/fail mask that is attached to the error
            def step(codes, codes_file):
                mask = None if mask_of is None else mask_of(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return step

        # validate checks, each a (message, condition, process) tuple
        self.checks = [
            ("Not Empty", not_empty, reject("Code list is empty")),
            ("In Database", known_code, reject("QA In Database", known_code)),
        ]

Define checks as a list of 3-tuples: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Atc(Proto):
class Atc(Proto):
    """Validation rules for "atc" codes.

    Extends Proto with the ATC check list: the code list must not be empty and
    codes must consist of capital letters and digits. No TRUD codes path is handed
    to the parent constructor and no database check is performed. Failures are
    reported through InvalidCodesException.
    """

    def __init__(self):
        super().__init__("atc", trud_codes_path=None)

        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def alphanumeric_capital(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def reject(message, mask_of=None):
            # build a process step that reports the failure; mask_of recomputes the
            # pass/fail mask that is attached to the error
            def step(codes, codes_file):
                mask = None if mask_of is None else mask_of(codes)
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return step

        # validate checks, each a (message, condition, process) tuple
        self.checks = [
            ("Not Empty", not_empty, reject("Code list is empty")),
            (
                "Alphanumeric Capital",
                alphanumeric_capital,
                reject("QA Alphanumeric Capital", alphanumeric_capital),
            ),
        ]

Define checks as a list of 3-tuples: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Med(Proto):
class Med(Proto):
    """Validation rules for "med" codes.

    Extends Proto with a single check: the code list must not be empty. No TRUD
    codes path is handed to the parent constructor. An empty list is reported
    through InvalidCodesException.
    """

    def __init__(self):
        super().__init__("med", trud_codes_path=None)

        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def reject_empty(codes, codes_file):
            # an empty code list cannot be repaired, so it is reported
            error = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(error))

        # validate checks, each a (message, condition, process) tuple
        self.checks = [("Not Empty", not_empty, reject_empty)]

Define checks as a list of 3-tuples: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Cprd(Proto):
class Cprd(Proto):
    """Validation rules for "cprd" codes.

    Extends Proto with a single check: the code list must not be empty. No TRUD
    codes path is handed to the parent constructor. An empty list is reported
    through InvalidCodesException.
    """

    def __init__(self):
        super().__init__("cprd", trud_codes_path=None)

        def not_empty(codes):
            return pd.Series([len(codes) > 0])

        def reject_empty(codes, codes_file):
            # an empty code list cannot be repaired, so it is reported
            error = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(error))

        # validate checks, each a (message, condition, process) tuple
        self.checks = [("Not Empty", not_empty, reject_empty)]

Define checks as a list of 3-tuples: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class CodeTypeParser:
class CodeTypeParser:
    """Registry that creates one parser instance per code type.

    The parsers are available through the code_types dictionary, keyed by the code
    type name ("read2", "read3", "icd10", "snomed", "opcs4", "atc", "med", "cprd").
    """

    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
        # the parsers can only be created once the TRUD processed directory exists
        processed_dir_ready = (
            trud_processed_dir.exists() and trud_processed_dir.is_dir()
        )
        if not processed_dir_ready:
            raise FileNotFoundError(
                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
            )

        # parser classes in the order in which they are instantiated
        parser_classes = {
            "read2": Read2,
            "read3": Read3,
            "icd10": Icd10,
            "snomed": Snomed,
            "opcs4": Opcs4,
            "atc": Atc,
            "med": Med,
            "cprd": Cprd,
        }
        self.code_types = {
            code_type: parser_class()
            for code_type, parser_class in parser_classes.items()
        }

A registry that creates one parser instance per code type and exposes them through the code_types dictionary

CodeTypeParser(trud_processed_dir: pathlib.Path = PosixPath('vocab/trud/processed'))
549    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
550        if not trud_processed_dir.exists() or not trud_processed_dir.is_dir():
551            raise FileNotFoundError(
552                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
553            )
554
555        self.code_types = {
556            "read2": Read2(),
557            "read3": Read3(),
558            "icd10": Icd10(),
559            "snomed": Snomed(),
560            "opcs4": Opcs4(),
561            "atc": Atc(),
562            "med": Med(),
563            "cprd": Cprd(),
564        }
code_types