From 8c8ce07c60b72821d5d2cbceb9e5ce6fe931ce2b Mon Sep 17 00:00:00 2001
From: Michael Boniface <m.j.boniface@soton.ac.uk>
Date: Wed, 19 Feb 2025 20:24:12 +0000
Subject: [PATCH] moved metadata to preprocess function

---
 acmc/parse.py |  7 +------
 acmc/phen.py  | 35 ++++++++++++++++++-----------------
 2 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/acmc/parse.py b/acmc/parse.py
index 349ab01..790cad6 100644
--- a/acmc/parse.py
+++ b/acmc/parse.py
@@ -55,7 +55,6 @@ class Proto():
     def process(self, codes, codes_file, ignore_errors=False):
         """ identify issues that do not pass and fix them with define/d process """
         errors = []
-        logger.debug("IN PROCESS")
         # Iter through each item in check.
         for msg, cond, fix in self.checks:
             # Check if any codes fail the check to False
@@ -79,11 +78,7 @@ class Proto():
     def verify(self, codes, codes_file):
         """ verify codes in codes file """
         conds = np.array([])
-        logger.debug("IN VERIFY")
-
-        logger.debug(codes_file)
-        logger.debug(f"TYPE {type(codes)}")
-        logger.debug(codes)
+
         # Iter through each item in check.
         for msg, cond, process in self.checks:
             # run conditional check
diff --git a/acmc/phen.py b/acmc/phen.py
index c576e7f..5b81bc3 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -344,16 +344,23 @@ def preprocess_code(out, codes, codes_file, checker, output_col, metadata_df):
     return out
 
 # Perform QA Checks on columns individually and append to df
-def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=None, translate=True,):
+def preprocess(df, file, target_code_type=None, codes_file=None, translate=True,):
     """ Parses each column individually - Order and length will not be preserved! """
     out = pd.DataFrame([]) # create output df to append to
-    logger.debug(f"CODES file {codes_file}")
+
+    meta_columns = [] # meta columns to keep with codes
+    if "actions" in file and "divide_col" in file["actions"]:
+        meta_columns += [file["actions"]["divide_col"]]
+    # TODO: enable metacolumns to be outputted - problem with map_file appending
+    if "metadata" in file["columns"]:
+        meta_columns += file["columns"]["metadata"]
+
     if target_code_type and not translate:
         # QA only on target codes
-        if target_code_type in columns:
+        if target_code_type in file[columns]:
             logger.info(f"Processing {target_code_type} Codes...")
             out = preprocess_code(out=out,
-                                  codes=df[columns[target_code_type]].dropna(),
+                                  codes=df[file[columns][target_code_type]].dropna(),
                                   codes_file=codes_file,
                                   checker=code_types[target_code_type](file_path),
                                   output_col=target_code_type,
@@ -363,15 +370,16 @@
     else:
         # QA for every code type in df run preprocess_code()
         for k, v in code_types.items():
-            if k in columns:
+            if k in file['columns']:
                 logger.info(f"Processing {k} Codes...")
                 out = preprocess_code(out=out,
-                                      codes=df[columns[k]].dropna(),
+                                      codes=df[file['columns'][k]].dropna(),
                                       codes_file=codes_file,
                                       checker=v(),
                                       output_col=k,
                                       metadata_df=df[meta_columns])
-    return out
+
+    return out, meta_columns
 
 # Translate Df with multiple codes into single code type Series
 def convert_codes(df, target, translate):
@@ -469,15 +477,8 @@ def map(phen_dir, target_code_type, translate=True):
 
         # Preprocessing & Validation Checks
         logger.debug("Processing and validating code formats")
-        meta_columns = [] # meta columns to keep with codes
-        if "actions" in file and "divide_col" in file["actions"]:
-            meta_columns += [file["actions"]["divide_col"]]
-        # TODO: enable metacolumns to be outputted - problem with map_file appending
-        if "metadata" in file["columns"]:
-            meta_columns += file["columns"]["metadata"]
-        df = preprocess(df,
-                        file["columns"],
-                        meta_columns=meta_columns,
+        df, meta_columns = preprocess(df,
+                        file,
                         codes_file=str(codes_file_path.resolve()),
                         target_code_type=target_code_type,
                         translate=translate)
@@ -486,7 +487,7 @@
         if ("actions" in file and "divide_col" in file["actions"] and len(df) > 0):
             divide_col = file["actions"]["divide_col"]
             logger.debug("Action: Dividing Table by", divide_col, "column into: ", df[divide_col].unique(),)
-            df = df.groupby(divide_col)
+            df = df.groupby(divide_col)
 
         # Map to Concept/Phenotype
         if len(df.index) != 0:
-- 
GitLab
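
For readers of this patch, a minimal illustrative sketch (not part of the commit) of what the refactor means for callers: preprocess now receives the whole concept-set file entry, derives meta_columns from it internally, and returns them alongside the processed dataframe, so map() no longer builds the list itself. The file dictionary, column names, and sample codes below are hypothetical stand-ins for an entry from acmc's phenotype configuration; the real keys come from that config and may differ.

```python
import pandas as pd

# Hypothetical concept-set file entry, shaped like the config this patch reads
# ("columns", "metadata", "actions"/"divide_col"); values are made up for the demo.
file = {
    "columns": {"read2": "code", "metadata": ["description"]},
    "actions": {"divide_col": "concept_set"},
}

# Small example codes table with the metadata and divide columns present.
df = pd.DataFrame({
    "code": ["C10..", "H33.."],
    "description": ["Diabetes mellitus", "Asthma"],
    "concept_set": ["DIABETES", "ASTHMA"],
})

# The same meta-column derivation that the patch moves from map() into preprocess():
meta_columns = []
if "actions" in file and "divide_col" in file["actions"]:
    meta_columns += [file["actions"]["divide_col"]]
if "metadata" in file["columns"]:
    meta_columns += file["columns"]["metadata"]

print(meta_columns)             # ['concept_set', 'description']
print(df[meta_columns].head())  # metadata kept alongside the processed codes
```

At the call site, the old preprocess(df, file["columns"], meta_columns=meta_columns, ...) correspondingly becomes df, meta_columns = preprocess(df, file, ...), as shown in the last phen.py hunk above.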