From 5b3b1d293adea93dc620de14502a218a9ab46dc2 Mon Sep 17 00:00:00 2001
From: Michael Boniface <m.j.boniface@soton.ac.uk>
Date: Thu, 27 Feb 2025 17:28:18 +0000
Subject: [PATCH] fix: started to add the metadata back in; the translation
 function is tricky as it's not currently clear how the joining of data
 frames and indexes actually works because it's not explicit. #28

---
 acmc/phen.py | 138 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 81 insertions(+), 57 deletions(-)

diff --git a/acmc/phen.py b/acmc/phen.py
index 05ac0d1..4a48fd3 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -46,6 +46,8 @@ DIVIDE_COL_ACTION = "divide_col"
 COL_ACTIONS = [SPLIT_COL_ACTION, CODES_COL_ACTION, DIVIDE_COL_ACTION]
 
 CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"]
+SOURCE_COL_SUFFIX = "_acmc_source"
+TARGET_COL_SUFFIX = "_acmc_target"
 
 # config.yaml schema
 CONFIG_SCHEMA = {
@@ -470,31 +472,24 @@ def process_actions(df, concept_set):
     return df
 
 
-# Perform QA Checks on columns individually and append to df
-def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
+def preprocess_source_concepts(df, concept_set, code_file_path):
    """Parses each column individually - Order and length will not be preserved!"""
     out = pd.DataFrame([])  # create output df to append to
     code_errors = []  # list of errors from processing
 
-    # TODO: Is there a better way of processing this action as it's distributed across
-    # different parts of the programme.
-    if (
-        "actions" in concept_set["file"]
-        and "divide_col" in concept_set["file"]["actions"]
-    ):
-        divide_col_df = df[concept_set["file"]["actions"]["divide_col"]]
-    else:
-        divide_col_df = pd.DataFrame()
-
+    # remove unnamed columns due to extra commas, missing headers, or incorrect parsing
+    df = df.drop(columns=[col for col in df.columns if "Unnamed" in col])
+
     # Preprocess codes
     code_types = parse.CodeTypeParser().code_types
     for code_type in concept_set["file"]["columns"]:
         parser = code_types[code_type]
-        logger.info(f"Processing {code_type} codes...")
+        logger.info(f"Processing {code_type} codes for {code_file_path}")
 
-        # get code types
-        codes = df[concept_set["file"]["columns"][code_type]].dropna()
+        # get codes by column name
+        source_col_name = concept_set["file"]["columns"][code_type]
+        codes = df[source_col_name].dropna()
         codes = codes.astype(str)  # convert to string
         codes = codes.str.strip()  # remove excess spaces
@@ -503,63 +498,80 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
         if len(errors) > 0:
             code_errors.extend(errors)
             logger.warning(f"Codes validation failed with {len(errors)} errors")
-
-        # append to output dataframe
+
+        # add processed codes to df
+        new_col_name = f"{source_col_name}{SOURCE_COL_SUFFIX}"
+        df = df.rename(columns={source_col_name: new_col_name})
+        process_codes = pd.DataFrame({code_type: codes}).join(df)
         out = pd.concat(
-            [out, pd.DataFrame({code_type: codes}).join(divide_col_df)],
+            [out, process_codes],
             ignore_index=True,
         )
+
+    logger.debug(out.head())
+
     return out, code_errors
 
 
+def get_code_type_from_col_name(col_name):
+    return col_name.split("_")[0]
+
+
 # Translate Df with multiple codes into single code type Series
-def translate_codes(df, target_code_type):
+def translate_codes(df, source_code_types, target_code_type, concept_name):
     codes = pd.Series([], dtype=str)
 
     # Convert codes to target type
     logger.info(f"Converting to target code type {target_code_type}")
-    for col_name in df.columns:
+
+    for source_code_type, source_code_column in source_code_types.items():
+        # if the target code type is the same as the source code type, no translation is needed, just append the source as the target
-        if col_name == target_code_type:
+        if source_code_type == target_code_type:
+            codes = pd.concat([codes, df[source_code_type]])
             logger.debug(
-                f"Target code type {target_code_type} has source code types {len(df)}, copying rather than translating"
-            )
-            codes = pd.concat([codes, df[target_code_type]])
-        else:
-            filename = f"{col_name}_to_{target_code_type}.parquet"
+                f"Target code type {target_code_type} is the same as source code type {source_code_type}, copying {len(df)} codes rather than translating"
+            )
+        else:
+            # get the translation filename using source to target code types
+            filename = f"{source_code_type}_to_{target_code_type}.parquet"
             map_path = trud.PROCESSED_PATH / filename
+
+            # do the mapping if it exists
             if map_path.exists():
-                col = df[col_name]
-                df_map = pd.read_parquet(map_path)
+                # get column from processed df and rename the series to what's needed for parquet
+
+                col = df[source_code_type]
+                df_map = pd.read_parquet(map_path)
                 # merge on corresponding codes and take target column
-                translated = pd.merge(col, df_map, how="left")[target_code_type]
+                translated_df = pd.merge(col, df_map, how="left")[target_code_type]
+                logger.debug("TRANSLATE")
+                logger.debug(translated_df.head())
+
                 # TODO: BUG mask does not match column
-                codes = pd.concat([codes, translated])  # merge to output
+                codes = pd.concat([codes, translated_df])
+                logger.debug("CODES")
+                logger.debug(codes.head())
+
             else:
                 logger.warning(
-                    f"No mapping from {col_name} to {target_code_type}, file {str(map_path.resolve())} does not exist"
+                    f"No mapping from {source_code_type} to {target_code_type}, file {str(map_path.resolve())} does not exist"
                 )
+
+    logger.debug("FULL CONCATENATED")
+    logger.debug(codes.head())
 
-    return codes
-
-
-# Append file's codes to output Df with concept
-def map_file(df, target_code_type, out, concept_name):
-
-    # translate codes
-    codes = translate_codes(df, target_code_type)
     codes = codes.dropna()  # delete NaNs
+    logger.debug(f"FULL CONCATENATED {len(codes.index)}")
 
     # Append to output if translated
-    if len(codes) > 0:
+    if len(codes.index) > 0:
         codes = pd.DataFrame({"CONCEPT": codes})
-        codes["CONCEPT_SET"] = np.repeat(concept_name.strip(), len(codes))
-        out = pd.concat([out, codes])
+        codes["CONCEPT_SET"] = np.repeat(concept_name.strip(), len(codes))
     else:
         logger.debug(f"No codes converted with target code type {target_code_type}")
 
-    return out
+    return codes
 
 
 def sql_row_exist(conn, table, column, value):
@@ -652,7 +664,7 @@ def map(phen_dir, target_code_type):
             f"Target code type {target_code_type} not in phenotype configuration map {phenotype['map']}"
         )
 
-    if target_code_type is not None:
+    if target_code_type is not None:
         map_target_code_type(phen_path, phenotype, target_code_type)
     else:
         for t in phenotype["map"]:
@@ -662,7 +674,6 @@ def map(phen_dir, target_code_type):
 
 
 def map_target_code_type(phen_path, phenotype, target_code_type):
-    logger.debug(f"Target coding format: {target_code_type}")
     codes_path = phen_path / CODES_DIR
 
     # Create output dataframe
@@ -680,21 +691,20 @@ def map_target_code_type(phen_path, phenotype, target_code_type):
 
         # process structural actions
         df = process_actions(df, concept_set)
 
-        # Preprocessing & Validation Checks
-        logger.debug("Processing and validating code formats")
-        df, errors = preprocess_codes(
+        # preprocess and validate source concepts
+        logger.debug("Processing and validating source concept codes")
+        df, errors = preprocess_source_concepts(
             df,
             concept_set,
             codes_file_path,
-            target_code_type=target_code_type,
         )
-        logger.debug(f"Length of errors from preprocess {len(errors)}")
+        logger.debug(f"Length of errors from preprocess_source_concepts {len(errors)}")
         if len(errors) > 0:
             code_errors.extend(errors)
         logger.debug(f" Length of code_errors {len(code_errors)}")
 
-        # Map
+        # Map source concept codes to target codes
         # if processing a source coding list with categorical data
         if (
             "actions" in concept_set["file"]
             and "divide_col" in concept_set["file"]["actions"]
@@ -708,11 +718,26 @@ def map_target_code_type(phen_path, phenotype, target_code_type):
             for cat, grp in df_grp:
                 if cat == concept_set["file"]["category"]:
                     grp = grp.drop(columns=[divide_col])  # delete categorical column
-                    out = map_file(
-                        grp, target_code_type, out, concept_name=concept_set["name"]
+                    trans_out = translate_codes(
+                        grp,
+                        source_code_types=concept_set["file"]["columns"],
+                        target_code_type=target_code_type,
+                        concept_name=concept_set["name"]
                     )
+                    out = pd.concat([out, trans_out])
         else:
-            out = map_file(df, target_code_type, out, concept_name=concept_set["name"])
+            trans_out = translate_codes(
+                df,
+                source_code_types=concept_set["file"]["columns"],
+                target_code_type=target_code_type,
+                concept_name=concept_set["name"])
+            out = pd.concat([out, trans_out])
+            logger.debug("TEST")
+            logger.debug(df.columns)
+            logger.debug(df.head())
+
+            logger.debug(out.columns)
+            logger.debug(out.head())
 
     if len(code_errors) > 0:
         logger.error(f"The map processing has {len(code_errors)} errors")
@@ -742,11 +767,10 @@ def map_target_code_type(phen_path, phenotype, target_code_type):
 
     # save concept sets as separate files
     concept_set_path = phen_path / CSV_PATH / target_code_type
 
-    # empty the concept-set directory if it exists but keep the .git file
-    git_items = [".git", ".gitkeep"]
+    # empty the concept-set directory except for hidden files, e.g. .git
     if concept_set_path.exists():
         for item in concept_set_path.iterdir():
-            if item not in git_items:
+            if not item.name.startswith("."):
                 item.unlink()
     else:
         concept_set_path.mkdir(parents=True, exist_ok=True)
-- 
GitLab
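
Note on the join in preprocess_source_concepts (the "joining of data frames
and indexes" the commit message calls tricky): DataFrame.join aligns rows on
index labels, not on row position. A minimal sketch of that behaviour, using
toy column names ("read2", "desc") that are illustrative assumptions rather
than names taken from this repo:

    import pandas as pd

    # toy input resembling a source code file; one invalid (NaN) row
    df = pd.DataFrame({"read2": [" A01", None, "B02"], "desc": ["a", "b", "c"]})

    # as in preprocess_source_concepts: dropna() keeps the original index
    # labels (0 and 2), so the cleaned series is sparse over df's index
    codes = df["read2"].dropna().astype(str).str.strip()

    # join() is a left join on index labels by default, so each cleaned
    # code re-attaches to its own original row: label 2 pairs with "c",
    # never with "b", even though the positions no longer match
    joined = pd.DataFrame({"read2_clean": codes}).join(df)
    print(joined)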
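On the "# TODO: BUG mask does not match column" in translate_codes: pd.merge
does not preserve the left operand's index; the merged result gets a fresh
RangeIndex, so the translated series no longer lines up with the original
rows. A sketch of the pitfall and one way around it, with illustrative code
types and a toy mapping table standing in for the <source>_to_<target>
parquet files:

    import pandas as pd

    # toy source column with a non-default index, e.g. after upstream dropna()
    col = pd.Series(["A01", "B02"], name="read2", index=[3, 7])

    # toy mapping table; the real one is read with pd.read_parquet(map_path)
    df_map = pd.DataFrame({"read2": ["A01", "B02"], "snomed": ["111", "222"]})

    # merge() joins on the shared "read2" column but discards col's index:
    # the result is re-indexed 0..n-1, so labels 3 and 7 are lost
    translated = pd.merge(col, df_map, how="left")["snomed"]
    print(translated.index.tolist())  # [0, 1] -- not [3, 7]

    # carrying the index through the merge as a column keeps the alignment
    translated_aligned = (
        pd.merge(col.reset_index(), df_map, how="left")
        .set_index("index")["snomed"]
    )
    print(translated_aligned.index.tolist())  # [3, 7]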