diff --git a/acmc/phen.py b/acmc/phen.py
index 5b81bc38912e7d068e5b682f8c098749521f741a..bc0d7e52abdd4acdf54611952d29f8f42db1a987 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -14,8 +14,7 @@ from pathlib import Path
 from urllib.parse import urlparse, urlunparse
 
 # acmc imports
-from acmc import trud, omop
-from acmc.parse import code_types
+from acmc import trud, omop, parse
 from acmc.omop import publish_concept_sets, setup
 
 # setup logging
@@ -258,7 +257,7 @@ def validate(phen_dir):
 
        # check columns specified are a supported medical coding type
        for column in file['columns']:
-           if column not in code_types and column != 'metadata':
+           if column not in parse.code_types and column != 'metadata':
                validation_errors.append(f"Column type {column} for file {concept_code_file_path} is not supported")
 
        # check the actions are supported
@@ -328,6 +327,28 @@ def process_actions(df, file):
 
    return df
 
+def log_invalid_code(codes, mask, code_type=None, file_path=None, cause=None):
+    """Log codes that failed validation to the error log csv and return only the valid codes."""
+
+    errors = pd.DataFrame([])
+    errors["CONCEPT"] = codes[~mask].astype(str)
+    errors["VOCABULARY"] = code_type
+    errors["SOURCE"] = file_path
+    errors["CAUSE"] = cause
+
+    # append to the error log csv, creating it if it does not yet exist
+    if os.path.exists(log_errors_path):
+        logger.debug("Appending to existing error log")
+        df_error = pd.read_csv(log_errors_path)
+        df_error = pd.concat([df_error, errors])
+        df_error.to_csv(log_errors_path, index=False)
+    else:
+        logger.debug("Creating new error log")
+        df_error = errors
+        df_error.to_csv(log_errors_path, index=False)
+
+    return codes[mask]
+
 def preprocess_code(out, codes, codes_file, checker, output_col, metadata_df):
 
    # preprocess codes
@@ -338,6 +359,7 @@ def preprocess_code(out, codes, codes_file, checker, output_col, metadata_df):
 
    if len(errors) > 0:
        raise Exception(f"Code validation failed with {len(errors)} errors")
 
+    # add metadata columns
    out = pd.concat([out, pd.DataFrame({output_col: codes}).join(metadata_df)], ignore_index=True)
 
@@ -362,14 +384,14 @@ def preprocess(df, file, target_code_type=None, codes_file=None, translate=True,
            out = preprocess_code(out=out,
                                  codes=df[file['columns'][target_code_type]].dropna(),
                                  codes_file=codes_file,
-                                 checker=code_types[target_code_type](file_path),
+                                 checker=parse.code_types[target_code_type](),
                                  output_col=target_code_type,
                                  metadata_df=df[meta_columns])
        else:
            logger.warning(f"No {target_code_type} Codes to process")
    else:
        # QA for every code type in df run preprocess_code()
-       for k, v in code_types.items():
+       for k, v in parse.code_types.items():
            if k in file['columns']:
                logger.info(f"Processing {k} Codes...")
                out = preprocess_code(out=out,
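
A minimal usage sketch for the new log_invalid_code helper, assuming `mask` is a boolean Series aligned with `codes` (e.g. produced by one of the parse.code_types checkers) and that `log_errors_path` is defined at module level in phen.py; the code type, file path, and validity regex below are hypothetical, chosen only to illustrate the calling convention, not taken from this patch:

    import pandas as pd

    codes = pd.Series(["C10F.", "BAD!", "C10E."])
    # hypothetical validity mask; in phen.py a checker would produce this
    mask = codes.str.match(r"^[A-Za-z0-9.]+$")

    valid = log_invalid_code(codes,
                             mask,
                             code_type="read2",                 # hypothetical vocabulary
                             file_path="concepts/example.csv",  # hypothetical source file
                             cause="invalid characters")
    # `valid` keeps only the rows where mask is True; the failing rows were
    # appended to the csv at log_errors_path with columns
    # CONCEPT, VOCABULARY, SOURCE and CAUSE.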