Skip to content
Snippets Groups Projects
Commit 633702e3 authored by mjbonifa's avatar mjbonifa
Browse files

checked metadata_df

parent 75615a67
Branches
Tags
No related merge requests found
...@@ -313,15 +313,33 @@ def read_table_file(path, excel_sheet=None): ...@@ -313,15 +313,33 @@ def read_table_file(path, excel_sheet=None):
return df return df
def process_actions(df, file):
    """Apply structural actions from the file config to df before preprocessing.

    Currently supports one action: splitting a column that mixes multiple
    code types ("split_col") into one new boolean-derived column per code
    type, each filled with the corresponding values from "codes_col".

    Args:
        df: codes table read from the source file.
        file: per-file config dict; actions are read from file["actions"].

    Returns:
        df with any new columns appended (returned unchanged when the
        config declares no applicable action).
    """
    logger.debug("Processing file structural actions")
    if ("actions" in file and "split_col" in file["actions"] and "codes_col" in file["actions"]):
        split_col = file["actions"]["split_col"]
        codes_col = file["actions"]["codes_col"]
        # FIX: logger.debug takes a single %-style format string; the original
        # passed bare extra positional args with no placeholders, which raises
        # "not all arguments converted during string formatting" at log time.
        logger.debug("Action: Splitting %s column into: %s", split_col, df[split_col].unique())
        codes = df[codes_col]
        oh = pd.get_dummies(df[split_col], dtype=bool)  # one hot encode
        oh = oh.where((oh != True), codes, axis=0)  # fill in 1s with codes
        oh[oh == False] = np.nan  # replace 0s with None
        df = pd.concat([df, oh], axis=1)  # merge in new columns
    return df
def preprocess_code(out, codes, codes_file, checker, output_col, metadata_df):
    """Normalise a series of codes, validate them, and append them to out.

    The codes are coerced to stripped strings, passed through the
    code-type checker for validation/repair, and — on success — joined
    with the metadata columns and concatenated onto the output table.

    Args:
        out: accumulator DataFrame the validated codes are appended to.
        codes: series of raw code values.
        codes_file: path of the source file (passed through to the checker).
        checker: code-type checker exposing process(codes, codes_file).
        output_col: name of the column to store the codes under.
        metadata_df: metadata columns to carry alongside each code.

    Returns:
        A new DataFrame: out plus the validated codes and their metadata.

    Raises:
        Exception: when the checker reports one or more validation errors.
    """
    cleaned = codes.astype(str).str.strip()  # string codes, no padding
    cleaned, errors = checker.process(cleaned, codes_file)
    if len(errors) > 0:
        raise Exception(f"Code validation failed with {len(errors)} errors")
    # attach metadata columns alongside the validated codes
    appended = pd.DataFrame({output_col: cleaned}).join(metadata_df)
    return pd.concat([out, appended], ignore_index=True)
...@@ -339,7 +357,7 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=N ...@@ -339,7 +357,7 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=N
codes_file=codes_file, codes_file=codes_file,
checker=code_types[target_code_type](file_path), checker=code_types[target_code_type](file_path),
output_col=target_code_type, output_col=target_code_type,
df_meta=df[meta_columns]) metadata_df=df[meta_columns])
else: else:
logger.warning(f"No {target_code_type} Codes to process") logger.warning(f"No {target_code_type} Codes to process")
else: else:
...@@ -352,7 +370,7 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=N ...@@ -352,7 +370,7 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=N
codes_file=codes_file, codes_file=codes_file,
checker=v(), checker=v(),
output_col=k, output_col=k,
df_meta=df[meta_columns]) metadata_df=df[meta_columns])
return out return out
# Translate Df with multiple codes into single code type Series # Translate Df with multiple codes into single code type Series
...@@ -390,7 +408,7 @@ def convert_codes(df, target, translate): ...@@ -390,7 +408,7 @@ def convert_codes(df, target, translate):
# Append file's codes to output Df with concept # Append file's codes to output Df with concept
def map_file(df, target_code_type, out, concepts, meta_columns=[], translate=True): def map_file(df, target_code_type, out, concepts, meta_columns=[], translate=True):
# seperate out meta_columns # seperate out meta_columns
df_meta = df[meta_columns] metadata_df = df[meta_columns]
df = df.drop(columns=meta_columns) df = df.drop(columns=meta_columns)
codes = convert_codes(df, target_code_type, translate) codes = convert_codes(df, target_code_type, translate)
codes = codes.dropna() # delete NaNs codes = codes.dropna() # delete NaNs
...@@ -398,7 +416,7 @@ def map_file(df, target_code_type, out, concepts, meta_columns=[], translate=Tru ...@@ -398,7 +416,7 @@ def map_file(df, target_code_type, out, concepts, meta_columns=[], translate=Tru
# Append to out df # Append to out df
if len(codes) > 0: if len(codes) > 0:
codes = pd.DataFrame({"CONCEPT": codes}) codes = pd.DataFrame({"CONCEPT": codes})
codes = codes.join(df_meta) codes = codes.join(metadata_df)
for concept in concepts: for concept in concepts:
codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes)) codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
out = pd.concat([out, codes]) out = pd.concat([out, codes])
...@@ -446,18 +464,8 @@ def map(phen_dir, target_code_type, translate=True): ...@@ -446,18 +464,8 @@ def map(phen_dir, target_code_type, translate=True):
else: else:
df = read_table_file(path=codes_file_path) df = read_table_file(path=codes_file_path)
# Perform Structural Changes to file before preprocessing # process structural actions
# split column with multiple code types df = process_actions(df, file)
logger.debug("Processing file structural actions")
if ("actions" in file and "split_col" in file["actions"] and "codes_col" in file["actions"]):
split_col = file["actions"]["split_col"]
codes_col = file["actions"]["codes_col"]
logger.debug("Action: Splitting", split_col, "column into:", df[split_col].unique(),)
codes = df[codes_col]
oh = pd.get_dummies(df[split_col], dtype=bool) # one hot encode
oh = oh.where((oh != True), codes, axis=0) # fill in 1s with codes
oh[oh == False] = np.nan # replace 0s with None
df = pd.concat([df, oh], axis=1) # merge in new columns
# Preprocessing & Validation Checks # Preprocessing & Validation Checks
logger.debug("Processing and validating code formats") logger.debug("Processing and validating code formats")
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment