diff --git a/acmc/phen.py b/acmc/phen.py
index 43ad09eee7db27f748bfec0b3128763b73a45a75..3157b52b7c671d5ced5f2a3d4c5141e12de2ba3d 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -392,41 +392,39 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
     out = pd.DataFrame([])  # create output df to append to
     code_errors = []  # list of errors from processing
 
-    metadata_df = pd.DataFrame()
-    meta_columns = []  # meta columns to keep with codes
+    # TODO: Is there a better way of processing this action as it's distributed across
+    # different parts of the programme.
     if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"]:
-        meta_columns += [concept_set["file"]["actions"]["divide_col"]]
-        metadata_df = df[meta_columns]
-    # TODO: enable metacolumns to be outputted - problem with map_file appending
-    # if "metadata" in file["columns"]:
-    #     meta_columns += file["columns"]["metadata"]
+        divide_col_df = df[concept_set["file"]["actions"]["divide_col"]]
+    else:
+        divide_col_df = pd.DataFrame()
 
     # Preprocess codes
     code_types = parse.CodeTypeParser().code_types
-    for code_type_name, code_type_parser in code_types.items():
-        if code_type_name in concept_set["file"]["columns"]:
-            logger.info(f"Processing {code_type_name} codes...")
-
-            # get code types
-            codes = df[concept_set["file"]["columns"][code_type_name]].dropna()
-            codes = codes.astype(str)  # convert to string
-            codes = codes.str.strip()  # remove excess spaces
-
-            # process codes, validating them using parser and returning the errors
-            codes, errors = code_type_parser.process(codes, code_file_path)
-            if len(errors) > 0:
-                code_errors.extend(errors)
-                logger.warning(f"Codes validation failed with {len(errors)} errors")
-
-            # add metadata columns
-            out = pd.concat(
-                [out, pd.DataFrame({code_type_name: codes}).join(metadata_df)],
-                ignore_index=True,
-            )
+    for code_type in concept_set["file"]["columns"]:
+        parser = code_types[code_type]
+        logger.info(f"Processing {code_type} codes...")
 
-    return out, meta_columns, code_errors
+        # get code types
+        codes = df[concept_set["file"]["columns"][code_type]].dropna()
+        codes = codes.astype(str)  # convert to string
+        codes = codes.str.strip()  # remove excess spaces
+
+        # process codes, validating them using parser and returning the errors
+        codes, errors = parser.process(codes, code_file_path)
+        if len(errors) > 0:
+            code_errors.extend(errors)
+            logger.warning(f"Codes validation failed with {len(errors)} errors")
+
+        # append to output dataframe
+        out = pd.concat(
+            [out, pd.DataFrame({code_type: codes}).join(divide_col_df)],
+            ignore_index=True,
+        )
+
+    return out, code_errors
+
+
 # Translate Df with multiple codes into single code type Series
 def translate_codes(df, target_code_type):
     codes = pd.Series([], dtype=str)
@@ -561,7 +559,7 @@ def map(phen_dir, target_code_type):
     for concept_set in phenotype["concept_sets"]:
         logger.debug(f"--- {concept_set['file']} ---")
 
-        # Load Code File
+        # Load code file
         codes_file_path = Path(codes_path / concept_set["file"]["path"])
         df = read_table_file(codes_file_path)
 
@@ -570,7 +568,7 @@ def map(phen_dir, target_code_type):
 
         # Preprocessing & Validation Checks
         logger.debug("Processing and validating code formats")
-        df, meta_columns, errors = preprocess_codes(
+        df, errors = preprocess_codes(
             df,
             concept_set,
             codes_file_path,
@@ -580,29 +578,16 @@ def map(phen_dir, target_code_type):
         logger.debug(f"Length of errors from preprocess {len(errors)}")
         if len(errors) > 0:
             code_errors.extend(errors)
-            logger.debug(f"  Length of code_errors {len(code_errors)}")
-
-        # partition table by categorical column
+            logger.debug(f"  Length of code_errors {len(code_errors)}")
+
+        # Map
+        # if processing a source coding list with categorical data
         if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"] and len(df) > 0:
             divide_col = concept_set["file"]["actions"]["divide_col"]
             logger.debug(f"Action: Dividing Table by {divide_col}")
-            logger.debug(df.head())
             logger.debug(f"column into: {df[divide_col].unique()}")
-            df = df.groupby(divide_col)
-
-        # Map to Concept/Phenotype
-        # TODO: This code needs refactory as it seems handling of the concept_set_categories should happen at another place
-        logger.debug(f"instance of df before if: {type(df)}")
-        if isinstance(df, pd.core.frame.DataFrame):
-            out = map_file(
-                df,
-                target_code_type,
-                out,
-                concept_name=concept_set['name']
-            )
-        elif isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
-            for cat, grp in df:
-                # what if there's no category, there's going to be an error
+            df_grp = df.groupby(divide_col)
+            for cat, grp in df_grp:
                 if cat == concept_set["file"]["category"]:
                     grp = grp.drop(
                         columns=[divide_col]
                     )
@@ -612,12 +597,14 @@
                     out = map_file(
                         grp,
                         target_code_type,
                         out,
                         concept_name=concept_set['name']
-                    )
-        else:
-            logger.debug(f"instance of df: {type(df)}")
-            # raise AttributeError(
-            #     f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
-            # )
+                    )
+        else:
+            out = map_file(
+                df,
+                target_code_type,
+                out,
+                concept_name=concept_set['name']
+            )
 
     if len(code_errors) > 0:
         logger.error(f"The map processing has {len(code_errors)} errors")
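
Note on the refactor above: preprocess_codes now returns the two-tuple (out, code_errors) instead of (out, meta_columns, code_errors), and the per-row divide_col values are carried through by an index-aligned join with the divide_col Series rather than a separate meta_columns list. The sketch below illustrates that join behaviour in isolation; the column names and data are hypothetical, not taken from the repository.

    import pandas as pd

    # Hypothetical source table: one code column plus a categorical divide column.
    df = pd.DataFrame(
        {"read_code": ["C10E.", None, "C10F."], "category": ["1", "2", "3"]}
    )
    divide_col_df = df["category"]  # a Series, as built in preprocess_codes above

    # dropna() removes row 1 but keeps the original index labels (0 and 2),
    # so the join realigns each surviving code with its category by index.
    codes = df["read_code"].dropna().astype(str).str.strip()
    out = pd.DataFrame({"read_code": codes}).join(divide_col_df)
    print(out)
    #   read_code category
    # 0     C10E.        1
    # 2     C10F.        3

Because the join is index-based, category labels stay attached to the right codes even after invalid rows are dropped, which is what lets map() later group the preprocessed table by divide_col and select the configured category.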