From 46d4752f75e4d43f2741acdb2aa5bf467303a1ce Mon Sep 17 00:00:00 2001 From: Jakub Dylag <jjd1c23@soton.ac.uk> Date: Fri, 28 Mar 2025 17:12:55 +0000 Subject: [PATCH] Allow multiple files per concept set - loop over map function --- acmc/phen.py | 117 ++++++++++++++++++++++++++------------------------- 1 file changed, 60 insertions(+), 57 deletions(-) diff --git a/acmc/phen.py b/acmc/phen.py index 4893144..119a4c9 100644 --- a/acmc/phen.py +++ b/acmc/phen.py @@ -818,67 +818,70 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st code_errors = [] # Process each folder in codes section - for concept_set in phenotype["concept_sets"]: - _logger.debug(f"--- {concept_set['file']} ---") - - # Load code file - codes_file_path = Path(concepts_path / concept_set["file"]["path"]) - df = _read_table_file(codes_file_path) - - # process structural actions - df = _process_actions(df, concept_set) - - # preprocessing and validate of source concepts - _logger.debug("Processing and validating source concept codes") - df, errors = _preprocess_source_concepts( - df, - concept_set, - codes_file_path, - ) - - # create df with just the source code columns - source_column_names = list(concept_set["file"]["columns"].keys()) - source_df = df[source_column_names] + for files in phenotype["concept_sets"]: + concept_set_name = files["name"] + for concept_set in files["files"]: + + _logger.debug(f"--- {concept_set} ---") + + # Load code file + codes_file_path = Path(concepts_path / concept_set["path"]) + df = _read_table_file(codes_file_path) + + # process structural actions + df = _process_actions(df, concept_set) + + # preprocessing and validate of source concepts + _logger.debug("Processing and validating source concept codes") + df, errors = _preprocess_source_concepts( + df, + concept_set, + codes_file_path, + ) - _logger.debug(source_df.columns) - _logger.debug(source_df.head()) + # create df with just the source code columns + source_column_names = list(concept_set["columns"].keys()) + source_df = df[source_column_names] - _logger.debug( - f"Length of errors from _preprocess_source_concepts {len(errors)}" - ) - if len(errors) > 0: - code_errors.extend(errors) - _logger.debug(f" Length of code_errors {len(code_errors)}") + _logger.debug(source_df.columns) + _logger.debug(source_df.head()) - # Map source concepts codes to target codes - # if processing a source coding list with categorical data - if ( - "actions" in concept_set["file"] - and "divide_col" in concept_set["file"]["actions"] - and len(df) > 0 - ): - divide_col = concept_set["file"]["actions"]["divide_col"] - _logger.debug(f"Action: Dividing Table by {divide_col}") - _logger.debug(f"column into: {df[divide_col].unique()}") - df_grp = df.groupby(divide_col) - for cat, grp in df_grp: - if cat == concept_set["file"]["category"]: - grp = grp.drop(columns=[divide_col]) # delete categorical column - source_df = grp[source_column_names] - trans_out = translate_codes( - source_df, - target_code_type=target_code_type, - concept_name=concept_set["name"], - ) - out = pd.concat([out, trans_out]) - else: - source_df = df[source_column_names] - trans_out = translate_codes( - source_df, - target_code_type=target_code_type, - concept_name=concept_set["name"], + _logger.debug( + f"Length of errors from _preprocess_source_concepts {len(errors)}" ) - out = pd.concat([out, trans_out]) + if len(errors) > 0: + code_errors.extend(errors) + _logger.debug(f" Length of code_errors {len(code_errors)}") + + # Map source concepts codes to target codes + # if processing a source coding list with categorical data + if ( + "actions" in concept_set + and "divide_col" in concept_set["actions"] + and len(df) > 0 + ): + divide_col = concept_set["actions"]["divide_col"] + _logger.debug(f"Action: Dividing Table by {divide_col}") + _logger.debug(f"column into: {df[divide_col].unique()}") + df_grp = df.groupby(divide_col) + for cat, grp in df_grp: + if cat == concept_set["category"]: + grp = grp.drop(columns=[divide_col]) # delete categorical column + source_df = grp[source_column_names] + trans_out = translate_codes( + source_df, + target_code_type=target_code_type, + concept_name=concept_set_name, + ) + out = pd.concat([out, trans_out]) + else: + source_df = df[source_column_names] + trans_out = translate_codes( + source_df, + target_code_type=target_code_type, + concept_name=concept_set_name, + ) + out = pd.concat([out, trans_out]) if len(code_errors) > 0: _logger.error(f"The map processing has {len(code_errors)} errors") -- GitLab