diff --git a/acmc/base.py b/acmc/base.py deleted file mode 100644 index ea14fd04ea9af9d5306798ff945de3a09727d3cb..0000000000000000000000000000000000000000 --- a/acmc/base.py +++ /dev/null @@ -1,39 +0,0 @@ -import pandas as pd -import os - -from acmc import logging_config -logger = logging_config.setup_logger() - -class bcolors: #for printing coloured text - HEADER = '\033[95m' - OKBLUE = '\033[94m' - OKCYAN = '\033[96m' - OKGREEN = '\033[92m' - WARNING = '\033[93m' - FAIL = '\033[91m' - ENDC = '\033[0m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - -def raise_(ex): - raise ex - -def log_invalid_code(codes, mask, code_type=None, error_file='build/error.csv', file_path=None, cause=None): - # print("ERROR WITH CODES", file_path, codes[~mask]) - - errors = pd.DataFrame([]) - errors["CONCEPT"] = codes[~mask].astype(str) - errors["VOCABULARY"] = code_type - errors["SOURCE"] = file_path - errors["CAUSE"] = cause - - #append to error log csv - if os.path.exists(error_file): - df_error = pd.read_csv(error_file) - df_error = pd.concat([df_error, errors]) - df_error.to_csv(error_file, index=False) - else: - df_error = errors - df_error.to_csv(error_file, index=False) - - return codes[mask] \ No newline at end of file diff --git a/acmc/logging_config.py b/acmc/logging_config.py index b099ce458a5140d619d5b64cb1f3e7c2db4a1a98..5af1bc6a47b85b622b039b37950236d7057e3b02 100644 --- a/acmc/logging_config.py +++ b/acmc/logging_config.py @@ -1,8 +1,22 @@ +import pandas as pd import logging DEFAULT_LOG_FILE = "acmc.log" +# TODO: Determine if bcolours is still needed considering use of logging not print +class bcolors: #for printing coloured text + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKCYAN = '\033[96m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + def setup_logger(log_level=logging.INFO): + """Sets up logger as a singleton outputing to file and sysout syserr""" # Create a logger logger = 
logging.getLogger('acmc_logger') logger.setLevel(logging.INFO) @@ -30,6 +44,7 @@ def setup_logger(log_level=logging.INFO): return logger def set_log_level(log_level): + """Sets the log level for the acmc logger""" logger = logging.getLogger('acmc_logger') logger.setLevel(log_level) # Set logger level diff --git a/acmc/main.py b/acmc/main.py index a78180cefcef618a18e71ae54980685121c91261..d1591b58908e5ec74b1bf90e25c7d1e6e697bd59 100644 --- a/acmc/main.py +++ b/acmc/main.py @@ -38,8 +38,7 @@ def phen_map(args): """Handle the `phen map` command.""" phen.map(args.phen_dir, args.target_coding, - args.translate, - args.verify) + args.translate) def phen_publish(args): """Handle the `phen publish` command.""" @@ -117,7 +116,6 @@ def main(): phen_map_parser.add_argument("-t", "--target-coding", required=True, choices=['read2', 'read3', 'icd10', 'snomed', 'opcs4'], help="Specify the target coding (read2, read3, icd10, snomed, opcs4)") # phen map flags phen_map_parser.add_argument("-tr", "--translate", action="store_true", default=False, help="Translate code types") - phen_map_parser.add_argument("-ve", "--verify", action="store_true", default=False, help="Verify codes") phen_map_parser.set_defaults(func=phen_map) # phen publish @@ -143,7 +141,7 @@ def main(): # setup logging if(args.debug): - logging_config.set_log_level(logging.DEBUG) + lc.set_log_level(logging.DEBUG) # Call the function associated with the command args.func(args) diff --git a/acmc/parse.py b/acmc/parse.py index b5fc34a31f7e6d6583d0c33c080aa83ca7a9cba5..349ab01c53cc0950188ba65f39c13bd8f0f451d6 100644 --- a/acmc/parse.py +++ b/acmc/parse.py @@ -4,13 +4,22 @@ import os # acmc imports from acmc import trud, logging_config as lc -from acmc.base import log_invalid_code, bcolors, raise_ +# setup logging logger = lc.setup_logger() -def in_database(codes, db, col): - return codes.isin(db[col]) +PHEN_CODE_ERROR_FILE = "code_errors.csv" +class InvalidCodesException(Exception): + """Custom exception class raised when 
invalid codes are found that cannot be resolved by processing""" + def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None): + super().__init__(message) + + # initialise class variables with provided parameters + for key, value in locals().items(): + if key != "self": + setattr(self, key, value) + class Proto(): """ Define checks as list of 3 tuple: (Message, Condition, Process) @@ -22,196 +31,226 @@ class Proto(): ( "Not Empty", lambda codes : pd.Series([len(codes) > 0]), #Should be true if passed - lambda codes : raise_(Exception("Code List is Empty")) #correts code, or logs, or throws error + lambda codes, codes_file : self.raise_exception(Exception(f"Code list is empty {codes_file}")) #correts code, or logs, or throws error ) ] - def __init__(self, file_path=None): - self.file_path = file_path - - def process(self, codes): - """ - identify issues that do not pass and fix them with defined process - """ - # codes = codes.dropna() - for msg, cond, process in self.checks: #run each check - if not cond(codes).all(): #if test failed - # print("Check:", msg, bcolors.FAIL+"FAILED"+bcolors.ENDC) - logger.error(f"Check: {msg} {(~cond(codes)).sum()} FAILED") - codes = process(codes) #run process to fix issue - if cond(codes).all(): #is resloved by process - logger.debug("Check:", msg, "is resolved") - else: #not resolved by process - raise Exception(f"ERROR: Check {msg} is NOT resolved") + def __init__(self, name, trud_codes_path=None): + if trud_codes_path is not None: + if trud_codes_path.is_file(): + self.trud_codes_path = trud_codes_path + self.db = pd.read_parquet(self.trud_codes_path) else: - logger.debug("Check: PASSED") - return codes + raise FileNotFoundError(f"Error: Read2 code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly") + + self.name = name + + def raise_exception(self, ex): + """ Raises an exception inside a lambda function. 
Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explicit""" + raise ex + + def in_database(self, codes, db, col): + return codes.isin(db[col]) + + def process(self, codes, codes_file, ignore_errors=False): + """ identify issues that do not pass and fix them with defined process """ + errors = [] + logger.debug("IN PROCESS") + # Iter through each item in check. + for msg, cond, fix in self.checks: + # Check if any codes fail the check to False + if not cond(codes).all(): + # Log the number of codes that failed + logger.debug(f"Check: {msg} {(~cond(codes)).sum()} failed, trying to fix") + # try fix errors by running lambda "process" function + try: + codes = fix(codes, codes_file) + logger.debug(f"Check: Fixed") + except InvalidCodesException as ex: + if ignore_errors: + errors.append(ex) + else: + raise ex + else: + logger.debug(f"Check: passed") + + return codes, errors - def verify(self, codes): - """ - verify all identified issues pass - """ + def verify(self, codes, codes_file): + """ verify codes in codes file """ conds = np.array([]) - for msg, cond, process in self.checks: #run each check + logger.debug("IN VERIFY") + + logger.debug(codes_file) + logger.debug(f"TYPE {type(codes)}") + logger.debug(codes) + # Iter through each item in check. 
+ for msg, cond, process in self.checks: + # run conditional check out = cond(codes) conds = np.append(conds, out.all()) - if not out.all(): #if test failed - logger.error(f"Verify: {msg} FAILED") - logger.error(codes[out]) #show failed codes - - if conds.all(): #check all have passed - logger.debug(f"Verify: ALL PASSED") - return True - else: #not all have passed - logger.error(f"Verify: {(len(conds) - conds.sum())} FAILED") - return False - + + return codes + class Read2(Proto): - def __init__(self, file_path=None): - super().__init__(file_path) + """ This Read2 class extends Proto, adding custom validation checks for a dataset of "Read2" codes. It ensures that the dataset is loaded, validates the codes based on several rules, and applies corrections or logs errors when necessary.""" + def __init__(self): + super().__init__('read2', trud.TRUD_PROCESSED_DIR / 'read2.parquet') - input_path = trud.TRUD_PROCESSED_DIR / 'read2.parquet' - if not input_path.is_file(): - raise FileNotFoundError(f"Error: Read2 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") - self.db = pd.read_parquet(input_path) - - self.checks = [ + # validate checks + self.checks = [ ( + # check codes are not empty, if empty throw an exception "Not Empty", lambda codes : pd.Series([len(codes) > 0]), - lambda codes : raise_(Exception("Code List is Empty")) + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"Code list is empty {codes_file}", + codes=codes, + codes_file=codes_file, + mask=None, + code_type=self.name)) ), ( + # check codes <5 characters, if too short pads it with . 
(dots) to reach 5 characters "Too Short", lambda codes : ~(codes.str.len() < 5), - lambda codes : codes.str.pad(width=5, side='right', fillchar='.') + lambda codes, codes_file : codes.str.pad(width=5, side='right', fillchar='.') ), ( + # check codes > 5 characters, If too long, truncates them to 5 characters "Too Long", lambda codes : ~(codes.str.len() > 5), - lambda codes : codes.str[:5] + lambda codes, codes_file : codes.str[:5] ), ( + # checks codes contain numbers, or dots (.), if not logs invalid code error "Alphanumeric Dot", - lambda codes : codes.str.match("^[a-zA-Z0-9\.]+$"), - lambda codes : log_invalid_code(codes, - codes.str.match("^[a-zA-Z0-9\.]+$"), #Log non-matching rows - code_type="read2", - file_path=self.file_path, - cause="QA Alphanumeric Dot"), + lambda codes : codes.str.match(r"^[a-zA-Z0-9.]+$"), + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"Illegal code format, not alphanumeric dot: {codes_file}", + codes=codes, + codes_file=codes_file, + mask=codes.str.match(r"^[a-zA-Z0-9.]+$"), + code_type=self.name)) ), ( + # checks code exists in self.db (the Read2 dataset). If missing log invalid codes. "In Database", - lambda codes : in_database(codes, self.db, "read2"), - lambda codes : log_invalid_code(codes, - in_database(codes, self.db, "read2"), #Log non-matching rows - code_type="read2", - file_path=self.file_path, - cause="QA In Database"), + lambda codes : self.in_database(codes, self.db, self.name), + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"Codes do not exist in database {codes_file}", + codes=codes, + codes_file=codes_file, + mask=self.in_database(codes, self.db, self.name), + code_type=self.name)) ), - ] class Read3(Proto): - def __init__(self, file_path=None): - super().__init__(file_path) - - input_path = trud.TRUD_PROCESSED_DIR / 'read3.parquet' - if not input_path.is_file(): - raise FileNotFoundError(f"Error: Read3 code file '{input_path}' does not exist. 
Please ensure you have installed TRUD correctly") - self.db = pd.read_parquet(input_path) + def __init__(self): + super().__init__('read3', trud.TRUD_PROCESSED_DIR / 'read3.parquet') self.checks = [ ( "Not Empty", lambda codes : pd.Series([len(codes) > 0]), - lambda codes : raise_(Exception("Code List is Empty")) + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"Code list is empty {codes_file}", + codes=codes, + codes_file=codes_file, + mask=None, + code_type=self.name)) ), ( "Too Short", lambda codes : ~(codes.str.len() < 5), - lambda codes : codes.str.pad(width=5, side='right', fillchar='.') + lambda codes, codes_file : codes.str.pad(width=5, side='right', fillchar='.') ), ( "Too Long", lambda codes : ~(codes.str.len() > 5), - lambda codes : codes.str[:5] + lambda codes, codes_file : codes.str[:5] ), ( "Alphanumeric Dot", - lambda codes : codes.str.match("^[a-zA-Z0-9\.]+$"), - lambda codes : log_invalid_code(codes, - codes.str.match("^[a-zA-Z0-9\.]+$"), #Log non-matching rows - code_type="read3", - file_path=self.file_path, - cause="QA Alphanumeric Dot"), + lambda codes : codes.str.match(r"^[a-zA-Z0-9.]+$"), + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"QA Alphanumeric Dot", + codes=codes, + codes_file=codes_file, + mask=codes.str.match(r"^[a-zA-Z0-9.]+$"), + code_type=self.name)) ), ( "In Database", - lambda codes : in_database(codes, self.db, "read3"), - lambda codes : log_invalid_code(codes, - in_database(codes, self.db, "read3"), #Log non-matching rows - code_type="read3", - file_path=self.file_path, - cause="QA In Database"), + lambda codes : self.in_database(codes, self.db, self.name), + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"QA In Database", + codes=codes, + codes_file=codes_file, + mask=self.in_database(codes, self.db, self.name), + code_type=self.name)) ), ] class Icd10(Proto): - def __init__(self, file_path=None): - super().__init__(file_path) - 
input_path = trud.TRUD_PROCESSED_DIR / 'icd10.parquet' - if not input_path.is_file(): - raise FileNotFoundError(f"Error: ICD10 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") - self.db = pd.read_parquet(input_path) + def __init__(self): + super().__init__('icd10', trud.TRUD_PROCESSED_DIR / 'icd10.parquet') + self.checks = [ ( "Not Empty", lambda codes : pd.Series([len(codes) > 0]), - lambda codes : raise_(Exception("Code List is Empty")) + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"Code list is empty {codes_file}", + codes=codes, + codes_file=codes_file, + mask=None, + code_type=self.name)) ), ( "Too Short", lambda codes : ~(codes.str.len() < 3), - lambda codes : log_invalid_code(codes, - ~(codes.str.len() < 3), #Log non-matching rows - code_type="icd10", - file_path=self.file_path, - cause="QA Too Short"), + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"QA Too Short", + codes=codes, + codes_file=codes_file, + mask=~(codes.str.len() < 3), + code_type=self.name)) ), ( "Has Dot", - lambda codes : ~(codes.str.contains('\.')), #check if contrains dot - lambda codes : codes.str.replace(".", "") #delete any dots in string + lambda codes : ~(codes.str.match(r".*\..*")), #check if contains dot + lambda codes, codes_file : codes.str.replace(".", "") #delete any dots in string # lambda codes : codes.str.split('\.').apply(lambda ls: ls[0]) #only get part before dot ), ( "Alphanumeric Capital", - lambda codes : codes.str.match("^[A-Z0-9]+$"), - lambda codes : log_invalid_code(codes, - codes.str.match("^[A-Z0-9]+$"), #Log non-matching rows - code_type="icd10", - file_path=self.file_path, - cause="QA Alphanumeric Capital"), + lambda codes : codes.str.match(r"^[A-Z0-9]+$"), + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"QA Alphanumeric Capital", + codes=codes, + codes_file=codes_file, + mask=codes.str.match(r"^[A-Z0-9]+$"), + code_type=self.name)) 
), ( "In Database", - lambda codes : ~(~in_database(codes, self.db, "icd10") & ~in_database(codes, self.db, "icd10_alt")), - lambda codes : log_invalid_code(codes, #Log non-matching rows - ~(~in_database(codes, self.db, "icd10") & ~in_database(codes, self.db, "icd10_alt")), - code_type="icd10", - file_path=self.file_path, - cause="QA In Database"), + lambda codes : ~(~self.in_database(codes, self.db, self.name) & ~self.in_database(codes, self.db, self.name + "_alt")), + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"QA In Database", + codes=codes, + codes_file=codes_file, + mask=~(~self.in_database(codes, self.db, self.name) & ~self.in_database(codes, self.db, self.name+"_alt")), + code_type=self.name)) ) # ( # "ICD10 Regex", # lambda codes : codes.str.match("[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Alpha, Num, Num , Dot?, 4xAlphNum* -# lambda codes : log_invalid_code(codes, +# lambda codes : lc.log_invalid_code(codes, # codes.str.match("[a-zA-Z][0-9][0-9]\.?[a-zA-Z0-9]*$"), #Log non-matching rows # code_type="icd10", -# file_path=self.file_path), - +# # ) ] @@ -221,45 +260,44 @@ class Icd10(Proto): class Snomed(Proto): - def __init__(self, file_path=None): - super().__init__(file_path) - - input_path = trud.TRUD_PROCESSED_DIR / 'snomed.parquet' - if not input_path.is_file(): - raise FileNotFoundError(f"Error: SNOMED code file '{input_path}' does not exist. 
Please ensure you have installed TRUD correctly") - self.db = pd.read_parquet(input_path) + def __init__(self): + super().__init__('snomed', trud.TRUD_PROCESSED_DIR / 'snomed.parquet') + self.checks = [ # ( # "Not Empty", # lambda codes : pd.Series([len(codes) > 0]), - # lambda codes : raise_(Exception("Code List is Empty")) + # lambda codes : raise_exception(Exception("Code List is Empty")) # ), ( "Too Short", lambda codes : ~(codes.str.len() < 6), - lambda codes : log_invalid(codes, #Log non-matching rows - ~(codes.str.len() < 6), - code_type="snomed", - file_path=self.file_path, - cause="QA Too Short"), + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"QA Too Short", + codes=codes, + codes_file=codes_file, + mask=~(codes.str.len() < 6), + code_type=self.name)) ), ( "Too Long", lambda codes : ~(codes.str.len() > 18), - lambda codes : log_invalid_code(codes, #Log non-matching rows - ~(codes.str.len() > 18), - code_type="snomed", - file_path=self.file_path, - cause="QA Too Long"), + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"QA Too Long", + codes=codes, + codes_file=codes_file, + mask=~(codes.str.len() > 18), + code_type=self.name)) ), ( "Numeric", - lambda codes : codes.str.match("[0-9]+$"), - lambda codes : log_invalid_code(codes, #Log non-matching rows - codes.str.match("[0-9]+$"), - code_type="snomed", - file_path=self.file_path, - cause="QA Numeric"), + lambda codes : codes.str.match(r"[0-9]+$"), + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"QA Numeric", + codes=codes, + codes_file=codes_file, + mask=codes.str.match(r"[0-9]+$"), + code_type=self.name)) ), # ( # "Is Integer", @@ -268,79 +306,98 @@ class Snomed(Proto): # ), ( "In Database", - lambda codes : in_database(codes, self.db, "snomed"), - lambda codes : log_invalid_code(codes, #Log non-matching rows - in_database(codes, self.db, "snomed"), - code_type="snomed", - file_path=self.file_path, - cause="QA In Database"), 
+ lambda codes : self.in_database(codes, self.db, self.name), + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"QA In Database", + codes=codes, + codes_file=codes_file, + mask=self.in_database(codes, self.db, self.name), + code_type=self.name)) ) ] class Opcs4(Proto): - def __init__(self, file_path=None): - super().__init__(file_path) - - input_path = trud.TRUD_PROCESSED_DIR / 'opcs4.parquet' - if not input_path.is_file(): - raise FileNotFoundError(f"Error: OPCS4 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") - self.db = pd.read_parquet(input_path) + def __init__(self): + super().__init__('opcs4', trud.TRUD_PROCESSED_DIR / 'opcs4.parquet') + self.checks = [ ( "Not Empty", lambda codes : pd.Series([len(codes) > 0]), - lambda codes : raise_(Exception("Code List is Empty")) + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"Code list is empty {codes_file}", + codes=codes, + codes_file=codes_file, + mask=None, + code_type=self.name)) ), ( "In Database", - lambda codes : in_database(codes, self.db, "opcs4"), - lambda codes : log_invalid_code(codes, #Log non-matching rows - in_database(codes, self.db, "opcs4"), - code_type="opcs4", - file_path=self.file_path, - cause="QA In Database"), + lambda codes : self.in_database(codes, self.db, self.name), + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"QA In Database", + codes=codes, + codes_file=codes_file, + mask=self.in_database(codes, self.db, self.name), + code_type=self.name)) ) ] class Atc(Proto): - def __init__(self, file_path=None): - super().__init__(file_path) + def __init__(self): + super().__init__('atc', trud_codes_path=None) self.checks = [ ( "Not Empty", lambda codes : pd.Series([len(codes) > 0]), - lambda codes : raise_(Exception("Code List is Empty")) + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"Code list is empty", + codes=codes, + codes_file=codes_file, + 
mask=None, + code_type=self.name)) ), ( "Alphanumeric Capital", - lambda codes : codes.str.match("^[A-Z0-9]+$"), - lambda codes : log_invalid_code(codes, #Log non-matching rows - codes.str.match("^[A-Z0-9]+$"), - code_type="atc", - file_path=self.file_path, - cause="QA Alphanumeric Capital"), + lambda codes : codes.str.match(r"^[A-Z0-9]+$"), + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"QA Alphanumeric Capital", + codes=codes, + codes_file=codes_file, + mask=codes.str.match(r"^[A-Z0-9]+$"), + code_type=self.name)) ), ] class Med(Proto): - def __init__(self, file_path=None): - super().__init__(file_path) + def __init__(self): + super().__init__('med', trud_codes_path=None) self.checks = [ ( "Not Empty", lambda codes : pd.Series([len(codes) > 0]), - lambda codes : raise_(Exception("Code List is Empty")) + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"Code list is empty {codes_file}", + codes=codes, + codes_file=codes_file, + mask=None, + code_type=self.name)) ) ] class Cprd(Proto): - def __init__(self, file_path=None): - super().__init__(file_path) + def __init__(self): + super().__init__('cprd', trud_codes_path=None) self.checks = [ ( "Not Empty", lambda codes : pd.Series([len(codes) > 0]), - lambda codes : raise_(Exception("Code List is Empty")) + lambda codes, codes_file : self.raise_exception( + InvalidCodesException(f"Code list is empty {codes_file}", + codes=codes, + codes_file=codes_file, + mask=None, + code_type=self.name)) ) ] @@ -364,4 +421,34 @@ vocab_types = { "atc": "ATC", "med": None, "cprd": None, -} \ No newline at end of file +} + + +# def log_invalid_code(self, codes=None, codes_file=None, mask=None, code_type=None, error_file=None, cause=None): +# logger = logging.getLogger('acmc_logger') +# logger.error(f"Invalid codes {codes_file}, {codes[~mask]}") + +# errors = pd.DataFrame([]) +# errors["CONCEPT"] = codes[~mask].astype(str) +# errors["VOCABULARY"] = code_type +# errors["SOURCE"] = 
file_path +# errors["CAUSE"] = cause +# +# #append to error log csv +# if os.path.exists(error_file): +# df_error = pd.read_csv(error_file) +# df_error = pd.concat([df_error, errors]) +# df_error.to_csv(error_file, index=False) +# else: +# df_error = errors +# df_error.to_csv(error_file, index=False) + +# return codes[mask] + + # write erros to a file +# error_path = phen_path / ERROR_FILE +# if error_path.exists(): +# error_df = pd.read_csv(error_path) +# error_df = error_df.drop_duplicates() # Remove Duplicates from Error file +# error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"]) +# error_df.to_csv(error_path, index=False) \ No newline at end of file diff --git a/acmc/phen.py b/acmc/phen.py index 8204c75f48302567172e19853161fd5fcae79060..a068efe02734a27a84dc7069703739dd141c6067 100644 --- a/acmc/phen.py +++ b/acmc/phen.py @@ -15,7 +15,6 @@ from urllib.parse import urlparse, urlunparse # acmc imports from acmc import trud, omop -from acmc.base import log_invalid_code, bcolors, raise_ from acmc.parse import Read2, Read3, Icd10, Snomed, Opcs4, Atc, code_types, vocab_types from acmc.omop import publish_concept_sets, setup @@ -303,49 +302,51 @@ def read_table_file(path, excel_sheet=None): return df -def preprocess_code(out, codes, checker, output_col, df_meta, verify=True): - codes = codes.astype(str) # convert to string - codes = codes.str.strip() # remove excess spaces - if verify: - codes = checker.process(codes) # resolve any identified issues - if not checker.verify(codes): # verify all issues resolved - logger.error("ERROR: FAILED") - # add metadata columns - out = pd.concat( - [out, pd.DataFrame({output_col: codes}).join(df_meta)], ignore_index=True - ) +def preprocess_code(out, codes, codes_file, checker, output_col, df_meta, verify=True): + logger.debug(f" Preprocess_code") + codes = codes.astype(str) # convert to string + codes = codes.str.strip() # remove excess spaces + logger.debug(f" CODE TYPE IN PREPROCESS {type(codes)}") + if 
verify: + codes, errors = checker.process(codes, codes_file) # resolve any identified issues + if len(errors) > 0: + raise Exception(f"Code validation failed with {len(errors)} errors") + # add metadata columns + out = pd.concat([out, pd.DataFrame({output_col: codes}).join(df_meta)], ignore_index=True) - return out + return out # Perform QA Checks on columns individually and append to df -def preprocess(df, columns, target_code_type=None, meta_columns=[], file_path=None, verify=True, translate=True,): - """ Parses each column individually - Order and length will not be preserved! """ - out = pd.DataFrame([]) # create output df to append to - if target_code_type and not translate: - # QA only on target codes - if target_code_type in columns: - logger.info(f"Processing {target_code_type} Codes...") - out = preprocess_code(out=out, +def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=None, verify=True, translate=True,): + """ Parses each column individually - Order and length will not be preserved! 
""" + out = pd.DataFrame([]) # create output df to append to + logger.debug(f"CODES file {codes_file}") + if target_code_type and not translate: + # QA only on target codes + if target_code_type in columns: + logger.info(f"Processing {target_code_type} Codes...") + out = preprocess_code(out=out, codes=df[columns[target_code_type]].dropna(), + codes_file=codes_file, checker=code_types[target_code_type](file_path), output_col=target_code_type, df_meta=df[meta_columns], verify=verify,) - else: - logger.warning(f"No {target_code_type} Codes to process") - else: - # QA for every code type in df run preprocess_code() - for k, v in code_types.items(): - if k in columns: - logger.info(f"Processing {k} Codes...") - out = preprocess_code(out=out, + else: + logger.warning(f"No {target_code_type} Codes to process") + else: + # QA for every code type in df run preprocess_code() + for k, v in code_types.items(): + if k in columns: + logger.info(f"Processing {k} Codes...") + out = preprocess_code(out=out, codes=df[columns[k]].dropna(), - checker=v(file_path), + codes_file=codes_file, + checker=v(), output_col=k, df_meta=df[meta_columns], verify=verify,) - - return out + return out # Translate Df with multiple codes into single code type Series def convert_codes(df, target, translate): @@ -434,14 +435,14 @@ def map(phen_dir, target_code_type, translate=True, verify=True): logger.debug(folder["description"]) if "files" in folder: for file in folder["files"]: - logger.debug("---" * 5, file["file"], "---" * 5) - file_path = codes_path / folder["folder"] / file["file"] + logger.debug(f"--- {file["file"]} ---") + codes_file_path = codes_path / folder["folder"] / file["file"] # Load Code File if "excel_sheet" in file: - df = read_table_file(path=file_path, excel_sheet=file["excel_sheet"]) + df = read_table_file(path=codes_file_path, excel_sheet=file["excel_sheet"]) else: - df = read_table_file(path=file_path) + df = read_table_file(path=codes_file_path) # Perform Structural Changes to 
file before preprocessing # split column with multiple code types @@ -467,7 +468,7 @@ def map(phen_dir, target_code_type, translate=True, verify=True): df = preprocess(df, file["columns"], meta_columns=meta_columns, - file_path=file_path, + codes_file=str(codes_file_path.resolve()), target_code_type=target_code_type, verify=verify, translate=translate) diff --git a/acmc/trud.py b/acmc/trud.py index 82c30735c1ad1e6d927bd85d84807b05095c9b6a..bd35553eb819700373be2317157d8f778faf0ea4 100644 --- a/acmc/trud.py +++ b/acmc/trud.py @@ -10,8 +10,6 @@ import pandas as pd import simpledbf from pathlib import Path -from acmc.base import bcolors - # setup logging import acmc.logging_config as lc logger = lc.setup_logger()