diff --git a/acmc/phen.py b/acmc/phen.py
index 15c2e49bcb7890c5dffed2d655ab31f510937851..4ac53e222c20f34ceddcd6878e8f30826289db11 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -187,7 +187,7 @@ def init(phen_dir, remote_url):
 
 def validate(phen_dir):
 	"""Validates the phenotype directory is a git repo with standard structure"""
-	logger.info(f"Validating phenotype configuration {phen_dir}")
+	logger.info(f"Validating phenotype: {phen_dir}")
 	phen_path = Path(phen_dir)
 	if not phen_path.is_dir():
 		raise NotADirectoryError(f"Error: '{phen_path}' is not a directory")
@@ -378,80 +378,82 @@ def preprocess(df, file, target_code_type=None, codes_file=None, translate=True,
 	if "metadata" in file["columns"]:
 		meta_columns += file["columns"]["metadata"]
 
-	if target_code_type and not translate:
-		# QA only on target codes
-		if target_code_type in file['columns']:
-			logger.info(f"Processing {target_code_type} Codes...")
+#	if target_code_type and not translate:
+#		# QA only on target codes
+#		if target_code_type in file['columns']:
+#			logger.info(f"Processing {target_code_type} Codes...")
+#			out = preprocess_code(out=out,
+#								  codes=df[file['columns'][target_code_type]].dropna(),
+#								  codes_file=codes_file,
+#								  checker=parse.code_types[target_code_type],
+#								  output_col=target_code_type,
+#								  metadata_df=df[meta_columns])
+#		else:
+#			logger.warning(f"No {target_code_type} Codes to process")
+#	else:
+
+	# QA for every code type in df run preprocess_code()
+	for code_type_name, code_type in parse.code_types.items():
+		if code_type_name in file['columns']:
+			logger.info(f"Processing {code_type_name} Codes...")
 			out = preprocess_code(out=out,
-								  codes=df[file['columns'][target_code_type]].dropna(),
+								  codes=df[file['columns'][code_type_name]].dropna(),
 								  codes_file=codes_file,
-								  checker=parse.code_types[target_code_type],
-								  output_col=target_code_type,
+								  checker=code_type,
+								  output_col=code_type_name,
 								  metadata_df=df[meta_columns])
-		else:
-			logger.warning(f"No {target_code_type} Codes to process")
-	else:
-		# QA for every code type in df run preprocess_code()
-		for code_type_name, code_type in parse.code_types.items():
-			if code_type_name in file['columns']:
-				logger.info(f"Processing {code_type_name} Codes...")
-				out = preprocess_code(out=out,
-									  codes=df[file['columns'][code_type_name]].dropna(),
-									  codes_file=codes_file,
-									  checker=code_type,
-									  output_col=code_type_name,
-									  metadata_df=df[meta_columns])
 
 	return out, meta_columns
 
 # Translate Df with multiple codes into single code type Series
-def convert_codes(df, target, translate):
+def convert_codes(df, target_code_type):
 	codes = pd.Series([], dtype=str)
-
-	# Append target column (if exists) - doesn't need conversion
-	if target in df.columns:
-		logger.debug(f"Has {len(df)} {target} in file")
-		codes = pd.concat([codes, df[target]])
-	if translate:
-		# Convert codes to target type
-		logger.info(f"target type {target}")
-		for col_name in df.columns[df.columns != target]:
-			filename = f"{col_name}_to_{target}.parquet"
+
+	# Convert codes to target type
+	logger.info(f"Converting to target code type {target_code_type}")
+	for col_name in df.columns:
+		# if the source code type is the same as the target code type, no translation is needed, just append source as target
+		if col_name == target_code_type:
+			logger.debug(f"Source code type matches target {target_code_type} for {len(df)} rows, copying rather than translating")
+			codes = pd.concat([codes, df[target_code_type]])
+		else:
+			filename = f"{col_name}_to_{target_code_type}.parquet"
 			map_path = trud.TRUD_PROCESSED_DIR / filename
 			if map_path.exists():
 				col = df[col_name]
 				df_map = pd.read_parquet(map_path)
-				translated = pd.merge(col, df_map, how="left")[target]  # merge on corresponding codes and take target colum
+				# merge on corresponding codes and take target column
+				translated = pd.merge(col, df_map, how="left")[target_code_type]
 				# TODO: BUG mask does not match column
 				# log_invalid_code(col,
-				#                  ~translated.isna(),
-				#                  code_type=col_name,
-				#                  cause=f"Translation to {target}")  # log codes with no translation
+				#				   ~translated.isna(),
+				#				   code_type=col_name,
+				#				   cause=f"Translation to {target}")  # log codes with no translation
 				codes = pd.concat([codes, translated])  # merge to output
 			else:
-				logger.warning(f"No mapping from {col_name} to {target}, file {str(map_path.resolve())} does not exist")
-		else:
-			logger.warning(f"NOT TRANSLATING {col_name}")
+				logger.warning(f"No mapping from {col_name} to {target_code_type}, file {str(map_path.resolve())} does not exist")
 
 	return codes
 
 # Append file's codes to output Df with concept
-def map_file(df, target_code_type, out, concepts, meta_columns=[], translate=True):
-	# seperate out meta_columns
-	metadata_df = df[meta_columns]
-	df = df.drop(columns=meta_columns)
-	codes = convert_codes(df, target_code_type, translate)
-	codes = codes.dropna()  # delete NaNs
-
-	# Append to out df
-	if len(codes) > 0:
-		codes = pd.DataFrame({"CONCEPT": codes})
-		codes = codes.join(metadata_df)
-		for concept in concepts:
-			codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
-			out = pd.concat([out, codes])
-	return out
+def map_file(df, target_code_type, out, concepts, meta_columns=[]):
+	# separate out meta_columns
+	metadata_df = df[meta_columns]
+	df = df.drop(columns=meta_columns)
+	codes = convert_codes(df, target_code_type)
+	codes = codes.dropna()  # delete NaNs
+
+	# Append to out df
+	if len(codes) > 0:
+		codes = pd.DataFrame({"CONCEPT": codes})
+		codes = codes.join(metadata_df)
+		for concept in concepts:
+			codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
+			out = pd.concat([out, codes])
+	else:
+		logger.debug(f"No codes converted with target code type {target_code_type}")
+
+	return out
 
 def sql_row_exist(conn, table, column, value):
 	# Execute and check if a result exists
@@ -463,7 +465,7 @@ def sql_row_exist(conn, table, column, value):
 	return exists
 
 def map(phen_dir, target_code_type, translate=True):
-	logger.info(f"Processing phenotype directory: {phen_dir}")
+	logger.info(f"Processing phenotype: {phen_dir}")
 	logger.debug(f"Target coding format: {target_code_type}")
 	logger.debug(f"Translating: {translate}")
 
@@ -518,8 +520,7 @@ def map(phen_dir, target_code_type, translate=True):
 				out = map_file(df,
 							   target_code_type,
 							   out,
 							   concepts=file["concept_set"],
-							   meta_columns=meta_columns,
-							   translate=translate,)
+							   meta_columns=meta_columns)
 			elif ("concept_set_categories" in file) and isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
 				meta_columns.remove(divide_col)  # delete categorical column
 				for cat, grp in df:
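
Usage sketch (not part of the patch): with this refactor, convert_codes always attempts translation and map_file drops its translate flag, so callers only pass a target code type. A minimal illustration of the new call shape, assuming acmc is importable, that "read2" and "icd10" are keys in parse.code_types, and that a mapping file icd10_to_read2.parquet may exist under trud.TRUD_PROCESSED_DIR (the code values and concept names below are invented):

    import pandas as pd
    from acmc import phen

    df = pd.DataFrame({
        "read2": ["C10..", "H33.."],            # already the target type: copied, not translated
        "icd10": ["E11", "J45"],                # translated via icd10_to_read2.parquet if present
        "description": ["diabetes", "asthma"],  # carried through as metadata
    })
    out = phen.map_file(df,
                        target_code_type="read2",
                        out=pd.DataFrame(),
                        concepts=["DIABETES"],
                        meta_columns=["description"])
    # out holds CONCEPT, description and CONCEPT_SET columns, one block per concept set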