From 09d928c9aeef8f4ace208ad86f176eee0f6fc7b9 Mon Sep 17 00:00:00 2001 From: Michael Boniface <m.j.boniface@soton.ac.uk> Date: Thu, 20 Feb 2025 12:48:00 +0000 Subject: [PATCH] added logging errors back in phen code, and not nested in the lambda function of parse --- acmc/parse.py | 95 ++++++++++++++++++++++++++------------------------- acmc/phen.py | 53 ++++++++++++---------------- 2 files changed, 71 insertions(+), 77 deletions(-) diff --git a/acmc/parse.py b/acmc/parse.py index b1a5414..7c470db 100644 --- a/acmc/parse.py +++ b/acmc/parse.py @@ -10,15 +10,18 @@ logger = lc.setup_logger() PHEN_CODE_ERROR_FILE = "code_errors.csv" -class InvalidCodesException(Exception): - """Custom exception class raised when invalid codes are found that cannot be resolved by processing""" +class CodesError(): def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None): - super().__init__(message) - # initialise class variables with provided parameters for key, value in locals().items(): if key != "self": setattr(self, key, value) + +class InvalidCodesException(Exception): + """Custom exception class raised when invalid codes are found that cannot be resolved by processing""" + def __init__(self, error): + super().__init__(error.message) + self.error = error class Proto(): """ @@ -66,7 +69,7 @@ class Proto(): codes = fix(codes, codes_file) logger.debug(f"Check: Fixed") except InvalidCodesException as ex: - errors.append(ex) + errors.append(ex.error) else: logger.debug(f"Check: passed") @@ -96,11 +99,11 @@ class Read2(Proto): "Not Empty", lambda codes : pd.Series([len(codes) > 0]), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"Code list is empty {codes_file}", + InvalidCodesException(CodesError(f"Code list is empty", codes=codes, codes_file=codes_file, mask=None, - code_type=self.name)) + code_type=self.name))) ), ( # check codes <5 characters, if too short pads it with . (dots) to reach 5 characters @@ -119,23 +122,23 @@ class Read2(Proto): "Alphanumeric Dot", lambda codes : codes.str.match(r"^[a-zA-Z0-9.]+$"), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"Illegal code format, not alphanumeric dot: {codes_file}", + InvalidCodesException(CodesError(f"Illegal code format, not alphanumeric dot", codes=codes, codes_file=codes_file, mask=codes.str.match(r"^[a-zA-Z0-9.]+$"), - code_type=self.name)) + code_type=self.name))) ), ( # checks code exists in self.db (the Read2 dataset). If missing log invalid codes. "In Database", lambda codes : self.in_database(codes, self.db, self.name), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"Codes do not exist in database {codes_file}", + InvalidCodesException(CodesError(f"Codes do not exist in database", codes=codes, codes_file=codes_file, mask=self.in_database(codes, self.db, self.name), - code_type=self.name)) - ), + code_type=self.name))) + ) ] class Read3(Proto): @@ -147,11 +150,11 @@ class Read3(Proto): "Not Empty", lambda codes : pd.Series([len(codes) > 0]), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"Code list is empty {codes_file}", + InvalidCodesException(CodesError(f"Code list is empty", codes=codes, codes_file=codes_file, mask=None, - code_type=self.name)) + code_type=self.name))) ), ( "Too Short", @@ -167,21 +170,21 @@ class Read3(Proto): "Alphanumeric Dot", lambda codes : codes.str.match(r"^[a-zA-Z0-9.]+$"), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"QA Alphanumeric Dot", + InvalidCodesException(CodesError(f"QA Alphanumeric Dot", codes=codes, codes_file=codes_file, check_regex=codes.str.match(r"^[a-zA-Z0-9.]+$"), - code_type=self.name)) + code_type=self.name))) ), ( "In Database", lambda codes : self.in_database(codes, self.db, self.name), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"QA In Database", + InvalidCodesException(CodesError(f"QA In Database", codes=codes, codes_file=codes_file, check_regex=self.in_database(codes, self.db, self.name), - code_type=self.name)) + code_type=self.name))) ), ] @@ -194,21 +197,21 @@ class Icd10(Proto): "Not Empty", lambda codes : pd.Series([len(codes) > 0]), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"Code list is empty {codes_file}", + InvalidCodesException(CodesError(f"Code list is empty {codes_file}", codes=codes, codes_file=codes_file, mask=None, - code_type=self.name)) + code_type=self.name))) ), ( "Too Short", lambda codes : ~(codes.str.len() < 3), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"QA Too Short", + InvalidCodesException(CodesError(f"QA Too Short", codes=codes, codes_file=codes_file, mask=~(codes.str.len() < 3), - code_type=self.name)) + code_type=self.name))) ), ( "Has Dot", @@ -220,21 +223,21 @@ class Icd10(Proto): "Alphanumeric Capital", lambda codes : codes.str.match(r"^[A-Z0-9]+$"), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"QA Alphanumeric Capital", + InvalidCodesException(CodesError(f"QA Alphanumeric Capital", codes=codes, codes_file=codes_file, mask=codes.str.match(r"^[A-Z0-9]+$"), - code_type=self.name)) + code_type=self.name))) ), ( "In Database", lambda codes : ~(~self.in_database(codes, self.db, self.name) & ~self.in_database(codes, self.db, self.name + "_alt")), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"QA In Database", + InvalidCodesException(CodesError(f"QA In Database", codes=codes, codes_file=codes_file, mask=~(~self.in_database(codes, self.db, self.name) & ~self.in_database(codes, self.db, self.name+"_alt")), - code_type=self.name)) + code_type=self.name))) ) # ( # "ICD10 Regex", @@ -265,31 +268,31 @@ class Snomed(Proto): "Too Short", lambda codes : ~(codes.str.len() < 6), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"QA Too Short", + InvalidCodesException(CodesError(f"QA Too Short", codes=codes, codes_file=codes_file, mask=~(codes.str.len() < 6), - code_type=self.name)) + code_type=self.name))) ), ( "Too Long", lambda codes : ~(codes.str.len() > 18), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"QA Too Long", + InvalidCodesException(CodesError(f"QA Too Long", codes=codes, codes_file=codes_file, mask=~(codes.str.len() > 18), - code_type=self.name)) + code_type=self.name))) ), ( "Numeric", lambda codes : codes.str.match(r"[0-9]+$"), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"QA Numeric", + InvalidCodesException(CodesError(f"QA Numeric", codes=codes, codes_file=codes_file, mask=codes.str.match(r"[0-9]+$"), - code_type=self.name)) + code_type=self.name))) ), # ( # "Is Integer", @@ -300,11 +303,11 @@ class Snomed(Proto): "In Database", lambda codes : self.in_database(codes, self.db, self.name), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"QA In Database", + InvalidCodesException(CodesError(f"QA In Database", codes=codes, codes_file=codes_file, mask=self.in_database(codes, self.db, self.name), - code_type=self.name)) + code_type=self.name))) ) ] @@ -317,21 +320,21 @@ class Opcs4(Proto): "Not Empty", lambda codes : pd.Series([len(codes) > 0]), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"Code list is empty {codes_file}", + InvalidCodesException(CodesError(f"Code list is empty", codes=codes, codes_file=codes_file, mask=None, - code_type=self.name)) + code_type=self.name))) ), ( "In Database", lambda codes : self.in_database(codes, self.db, self.name), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"QA In Database", + InvalidCodesException(CodesError(f"QA In Database", codes=codes, codes_file=codes_file, mask=self.in_database(codes, self.db, self.name), - code_type=self.name)) + code_type=self.name))) ) ] @@ -343,21 +346,21 @@ class Atc(Proto): "Not Empty", lambda codes : pd.Series([len(codes) > 0]), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"Code list is empty", + InvalidCodesException(CodesError(f"Code list is empty", codes=codes, codes_file=codes_file, mask=None, - code_type=self.name)) + code_type=self.name))) ), ( "Alphanumeric Capital", lambda codes : codes.str.match(r"^[A-Z0-9]+$"), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"QA Alphanumeric Capital", + InvalidCodesException(CodesError(f"QA Alphanumeric Capital", codes=codes, codes_file=codes_file, mask=codes.str.match(r"^[A-Z0-9]+$"), - code_type=self.name)) + code_type=self.name))) ), ] @@ -369,11 +372,11 @@ class Med(Proto): "Not Empty", lambda codes : pd.Series([len(codes) > 0]), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"Code list is empty {codes_file}", + InvalidCodesException(CodesError(f"Code list is empty", codes=codes, codes_file=codes_file, mask=None, - code_type=self.name)) + code_type=self.name))) ) ] @@ -385,11 +388,11 @@ class Cprd(Proto): "Not Empty", lambda codes : pd.Series([len(codes) > 0]), lambda codes, codes_file : self.raise_exception( - InvalidCodesException(f"Code list is empty {codes_file}", + InvalidCodesException(CodesError(f"Code list is empty", codes=codes, codes_file=codes_file, mask=None, - code_type=self.name)) + code_type=self.name))) ) ] diff --git a/acmc/phen.py b/acmc/phen.py index df354e2..41c3520 100644 --- a/acmc/phen.py +++ b/acmc/phen.py @@ -328,28 +328,6 @@ def process_actions(df, file): return df -def log_invalid_code(codes, mask, code_type=None, file_path=None, cause=None): - # print("ERROR WITH CODES", file_path, codes[~mask]) - - errors = pd.DataFrame([]) - errors["CONCEPT"] = codes[~mask].astype(str) - errors["VOCABULARY"] = code_type - errors["SOURCE"] = file_path - errors["CAUSE"] = cause - - #append to error log csv - if os.path.exists(log_errors_path): - print("FILE EXISTS") - df_error = pd.read_csv(log_errors_path) - df_error = pd.concat([df_error, errors]) - df_error.to_csv(log_errors_path, index=False) - else: - print("FILE NOT EXIST") - df_error = errors - df_error.to_csv(log_errors_path, index=False) - - return codes[mask] - # Perform QA Checks on columns individually and append to df def preprocess_codes(df, file, target_code_type=None, codes_file=None): """ Parses each column individually - Order and length will not be preserved! """ @@ -378,8 +356,8 @@ def preprocess_codes(df, file, target_code_type=None, codes_file=None): # process codes, validating them using parser and returning the errors codes, errors = code_type_parser.process(codes, codes_file) if len(errors) > 0: - code_errors = code_errors.append(errors) - logger.warning(f"Code validation failed with {len(errors)} errors") + code_errors.extend(errors) + logger.warning(f"Codes validation failed with {len(errors)} errors") # add metadata columns out = pd.concat([out, pd.DataFrame({code_type_name: codes}).join(metadata_df)], ignore_index=True) @@ -443,6 +421,17 @@ def sql_row_exist(conn, table, column, value): return exists +def write_code_errors(code_errors, code_errors_path): + err_df = pd.DataFrame([ + {"CONCEPT": ", ".join(err.codes[~err.mask].tolist()), + "VOCABULARY": err.code_type, + "SOURCE": err.codes_file, + "CAUSE": err.message} for err in code_errors]) + + err_df = err_df.drop_duplicates() # Remove Duplicates from Error file + err_df = err_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"]) + err_df.to_csv(code_errors_path, index=False, mode="w") + def map(phen_dir, target_code_type): logger.info(f"Processing phenotype: {phen_dir}") logger.debug(f"Target coding format: {target_code_type}") @@ -462,7 +451,7 @@ def map(phen_dir, target_code_type): # Create output dataframe out = pd.DataFrame([]) - code_errors [] + code_errors = [] # Process each folder in codes section for folder in codes: @@ -486,7 +475,10 @@ def map(phen_dir, target_code_type): file, codes_file=str(codes_file_path.resolve()), target_code_type=target_code_type) - code_errors = code_errors.append(errors) + logger.debug(f" Length of errors from preprocess {len(errors)}") + if len(errors) > 0: + code_errors.extend(errors) + logger.debug(f" Length of code_errors {len(code_errors)}") # partition table by categorical column if ("actions" in file and "divide_col" in file["actions"] and len(df) > 0): @@ -521,10 +513,13 @@ def map(phen_dir, target_code_type): logger.warning(f"File {file} has no output after preprocessing in config {str(config_path.resolve())}") if(len(code_errors) > 0): - logger.error(f"The map processing has {len(code_errors)} errors) + logger.error(f"The map processing has {len(code_errors)} errors") + error_filename = f"{target_code_type}-code-errors.csv" + write_code_errors(code_errors, phen_path / MAP_DIR / error_filename) # Check there is output from processing if len(out.index) == 0: + logger.error(f"No output after map processing") raise Exception(f"No output after map processing, check config {str(config_path.resolve())}") # Final processing @@ -543,9 +538,7 @@ def map(phen_dir, target_code_type): # Save output to map directory output_filename = target_code_type + '.csv' - map_path = phen_path / MAP_DIR / output_filename - out.to_csv(map_path, index=False) logger.info(f"Saved mapped concepts to {str(map_path.resolve())}") @@ -574,8 +567,6 @@ def map(phen_dir, target_code_type): shutil.copy(trud.VERSION_PATH, phen_path / trud.VERSION_FILE) shutil.copy(omop.VERSION_PATH, phen_path / omop.VERSION_FILE) - logger.debug(f"Saved concept_sets to {str(concept_set_path.resolve())}") - logger.info(f"Phenotype processed successfully") def publish(phen_dir): -- GitLab