def process_actions(df, file):
    """Apply the structural "actions" configured for a codes file.

    Only one action is handled here: splitting a column that mixes several
    code types. It runs when ``file["actions"]`` contains both ``split_col``
    and ``codes_col``. For every distinct value found in ``split_col`` a new
    column named after that value is appended to ``df``; a row holds its
    ``codes_col`` value in the column matching its ``split_col`` value and
    NaN in the others.

    Args:
        df: Table read from the codes file.
        file: Mapping describing the file; may contain an ``"actions"``
            mapping with ``"split_col"`` and ``"codes_col"`` entries.

    Returns:
        ``df`` itself when no split action is configured, otherwise a new
        DataFrame with the per-value code columns appended.
    """
    logger.debug("Processing file structural actions")

    actions = file["actions"] if "actions" in file else {}
    if "split_col" not in actions or "codes_col" not in actions:
        return df

    split_col = actions["split_col"]
    codes_col = actions["codes_col"]
    # Use %-style placeholders: passing extra positional arguments without
    # placeholders makes the logging module fail when formatting the record.
    logger.debug(
        "Action: Splitting %s column into: %s",
        split_col,
        df[split_col].unique(),
    )

    # Cast to object so the codes keep their original values (no int -> float
    # upcast) once NaN is introduced for the non-matching rows.
    codes = df[codes_col].astype(object)
    membership = pd.get_dummies(df[split_col], dtype=bool)  # one hot encode

    # Keep the code where the row belongs to the column's value, NaN elsewhere.
    # Masking per column (instead of comparing the filled frame with False)
    # preserves genuine codes that happen to equal 0 or False.
    split_codes = pd.DataFrame(
        {name: codes.where(mask) for name, mask in membership.items()},
        index=df.index,
    )

    return pd.concat([df, split_codes], axis=1)  # merge in new columns


def preprocess_code(out, codes, codes_file, checker, output_col, metadata_df):
    """Validate one column of codes and append it, with metadata, to ``out``.

    Args:
        out: DataFrame accumulating the processed codes.
        codes: Series of raw codes; converted to stripped strings before
            validation.
        codes_file: Passed through to ``checker.process`` unchanged.
        checker: Object exposing ``process(codes, codes_file)`` that returns
            a ``(codes, errors)`` pair.
        output_col: Name of the column the processed codes are stored under.
        metadata_df: Columns joined (on the index) alongside the codes.

    Returns:
        ``out`` with the processed codes and metadata rows appended; the
        index of the result is reset.

    Raises:
        Exception: If ``checker.process`` reports one or more errors.
    """
    # preprocess codes
    codes = codes.astype(str)  # convert to string
    codes = codes.str.strip()  # remove excess spaces

    codes, errors = checker.process(codes, codes_file)
    if len(errors) > 0:
        raise Exception(f"Code validation failed with {len(errors)} errors")

    # add metadata columns
    processed = pd.DataFrame({output_col: codes}).join(metadata_df)
    out = pd.concat([out, processed], ignore_index=True)

    return out
logger.warning(f"No {target_code_type} Codes to process") else: @@ -352,7 +370,7 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=N codes_file=codes_file, checker=v(), output_col=k, - df_meta=df[meta_columns]) + metadata_df=df[meta_columns]) return out # Translate Df with multiple codes into single code type Series @@ -390,7 +408,7 @@ def convert_codes(df, target, translate): # Append file's codes to output Df with concept def map_file(df, target_code_type, out, concepts, meta_columns=[], translate=True): # seperate out meta_columns - df_meta = df[meta_columns] + metadata_df = df[meta_columns] df = df.drop(columns=meta_columns) codes = convert_codes(df, target_code_type, translate) codes = codes.dropna() # delete NaNs @@ -398,7 +416,7 @@ def map_file(df, target_code_type, out, concepts, meta_columns=[], translate=Tru # Append to out df if len(codes) > 0: codes = pd.DataFrame({"CONCEPT": codes}) - codes = codes.join(df_meta) + codes = codes.join(metadata_df) for concept in concepts: codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes)) out = pd.concat([out, codes]) @@ -446,18 +464,8 @@ def map(phen_dir, target_code_type, translate=True): else: df = read_table_file(path=codes_file_path) - # Perform Structural Changes to file before preprocessing - # split column with multiple code types - logger.debug("Processing file structural actions") - if ("actions" in file and "split_col" in file["actions"] and "codes_col" in file["actions"]): - split_col = file["actions"]["split_col"] - codes_col = file["actions"]["codes_col"] - logger.debug("Action: Splitting", split_col, "column into:", df[split_col].unique(),) - codes = df[codes_col] - oh = pd.get_dummies(df[split_col], dtype=bool) # one hot encode - oh = oh.where((oh != True), codes, axis=0) # fill in 1s with codes - oh[oh == False] = np.nan # replace 0s with None - df = pd.concat([df, oh], axis=1) # merge in new columns + # process structural actions + df = process_actions(df, 
file) # Preprocessing & Validation Checks logger.debug("Processing and validating code formats")