Commit cae3acc7 authored by mjbonifa

Merge branch '27-fix-divide-action-using-metadata-columns-to-control-program-logic-2' into 'dev'

refactor: tidied up the action code for categories but still not optimal as...

Closes #27

See merge request meldb/concepts-processing!10
parents f1f18dcd 49d78a3f
@@ -392,41 +392,39 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
     out = pd.DataFrame([])  # create output df to append to
     code_errors = []  # list of errors from processing
-    metadata_df = pd.DataFrame()
-    meta_columns = []  # meta columns to keep with codes
+    # TODO: Is there a better way of processing this action as it's distributed across
+    # different parts of the programme.
     if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"]:
-        meta_columns += [concept_set["file"]["actions"]["divide_col"]]
-        metadata_df = df[meta_columns]
-    # TODO: enable metacolumns to be outputted - problem with map_file appending
-    # if "metadata" in file["columns"]:
-    #     meta_columns += file["columns"]["metadata"]
+        divide_col_df = df[concept_set["file"]["actions"]["divide_col"]]
+    else:
+        divide_col_df = pd.DataFrame()

     # Preprocess codes
     code_types = parse.CodeTypeParser().code_types
-    for code_type_name, code_type_parser in code_types.items():
-        if code_type_name in concept_set["file"]["columns"]:
-            logger.info(f"Processing {code_type_name} codes...")
+    for code_type in concept_set["file"]["columns"]:
+        parser = code_types[code_type]
+        logger.info(f"Processing {code_type} codes...")

-            # get code types
-            codes = df[concept_set["file"]["columns"][code_type_name]].dropna()
-            codes = codes.astype(str)  # convert to string
-            codes = codes.str.strip()  # remove excess spaces
+        # get code types
+        codes = df[concept_set["file"]["columns"][code_type]].dropna()
+        codes = codes.astype(str)  # convert to string
+        codes = codes.str.strip()  # remove excess spaces

-            # process codes, validating them using parser and returning the errors
-            codes, errors = code_type_parser.process(codes, code_file_path)
-            if len(errors) > 0:
-                code_errors.extend(errors)
-                logger.warning(f"Codes validation failed with {len(errors)} errors")
+        # process codes, validating them using parser and returning the errors
+        codes, errors = parser.process(codes, code_file_path)
+        if len(errors) > 0:
+            code_errors.extend(errors)
+            logger.warning(f"Codes validation failed with {len(errors)} errors")

-            # add metadata columns
-            out = pd.concat(
-                [out, pd.DataFrame({code_type_name: codes}).join(metadata_df)],
-                ignore_index=True,
-            )
+        # append to output dataframe
+        out = pd.concat(
+            [out, pd.DataFrame({code_type: codes}).join(divide_col_df)],
+            ignore_index=True,
+        )

-    return out, meta_columns, code_errors
+    return out, code_errors


 # Translate Df with multiple codes into single code type Series
 def translate_codes(df, target_code_type):
     codes = pd.Series([], dtype=str)
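For orientation, a minimal self-contained sketch of the join behaviour on the new right-hand side: the divide column is kept as a Series and joined back onto each code-type frame. The column names ("read2", "category") are made up for illustration and are not taken from this repository.

import pandas as pd

# toy input: one code column plus a categorical divide column
df = pd.DataFrame({
    "read2": ["C10..", "C10E.", None],
    "category": ["diabetes", "diabetes", "hypertension"],
})

divide_col_df = df["category"]  # metadata kept as a Series, as in the new code
out = pd.DataFrame([])

codes = df["read2"].dropna().astype(str).str.strip()
out = pd.concat(
    [out, pd.DataFrame({"read2": codes}).join(divide_col_df)],
    ignore_index=True,
)
print(out)  # each cleaned code now carries its "category" value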
@@ -561,7 +559,7 @@ def map(phen_dir, target_code_type):
     for concept_set in phenotype["concept_sets"]:
         logger.debug(f"--- {concept_set['file']} ---")

-        # Load Code File
+        # Load code file
         codes_file_path = Path(codes_path / concept_set["file"]["path"])
         df = read_table_file(codes_file_path)
@@ -570,7 +568,7 @@ def map(phen_dir, target_code_type):
         # Preprocessing & Validation Checks
         logger.debug("Processing and validating code formats")
-        df, meta_columns, errors = preprocess_codes(
+        df, errors = preprocess_codes(
             df,
             concept_set,
             codes_file_path,
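A hedged sketch of what the narrowed return contract means for the caller: the divide column now travels inside the returned dataframe, so the separate meta_columns list is redundant. Column names below are illustrative only.

import pandas as pd

# stand-in for the frame returned by the new preprocess_codes
out = pd.DataFrame({
    "read2": ["C10..", "C10E."],
    "category": ["diabetes", "diabetes"],
})
actions = {"divide_col": "category"}  # as read from concept_set["file"]["actions"]

divide_col = actions["divide_col"]
assert divide_col in out.columns  # metadata is recoverable from the frame itself
print(out.groupby(divide_col).size())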
@@ -580,29 +578,16 @@ def map(phen_dir, target_code_type):
         logger.debug(f"Length of errors from preprocess {len(errors)}")
         if len(errors) > 0:
             code_errors.extend(errors)
         logger.debug(f" Length of code_errors {len(code_errors)}")

-        # partition table by categorical column
-        # if processing a source coding list with categorical data
+        # Map
         if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"] and len(df) > 0:
             divide_col = concept_set["file"]["actions"]["divide_col"]
             logger.debug(f"Action: Dividing Table by {divide_col}")
-            logger.debug(df.head())
             logger.debug(f"column into: {df[divide_col].unique()}")
-            df = df.groupby(divide_col)
-
-        # Map to Concept/Phenotype
-        # TODO: This code needs refactory as it seems handling of the concept_set_categories should happen at another place
-        logger.debug(f"instance of df before if: {type(df)}")
-        if isinstance(df, pd.core.frame.DataFrame):
-            out = map_file(
-                df,
-                target_code_type,
-                out,
-                concept_name=concept_set['name']
-            )
-        elif isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
-            for cat, grp in df:
-                # what if there's no category, there's going to be an error
+            df_grp = df.groupby(divide_col)
+            for cat, grp in df_grp:
                 if cat == concept_set["file"]["category"]:
                     grp = grp.drop(
                         columns=[divide_col]
@@ -612,12 +597,14 @@ def map(phen_dir, target_code_type):
                         target_code_type,
                         out,
                         concept_name=concept_set['name']
                     )
         else:
-            logger.debug(f"instance of df: {type(df)}")
-            # raise AttributeError(
-            #     f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
-            # )
+            out = map_file(
+                df,
+                target_code_type,
+                out,
+                concept_name=concept_set['name']
+            )

     if len(code_errors) > 0:
         logger.error(f"The map processing has {len(code_errors)} errors")
...
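Finally, a minimal sketch of the category dispatch that replaces the old isinstance checks: group once, map only the group whose key matches the concept set's declared category, and fall back to mapping the whole frame when no divide action applies. Data and column names are toy values, not taken from the repository.

import pandas as pd

df = pd.DataFrame({
    "code": ["A1", "A2", "B1"],
    "category": ["keep", "keep", "other"],
})
wanted = "keep"  # plays the role of concept_set["file"]["category"]

for cat, grp in df.groupby("category"):
    if cat == wanted:
        grp = grp.drop(columns=["category"])
        print(grp)  # only the matching group would be passed to map_file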