From 633702e36c6d30963a74dbf5039d936d75e5ee7a Mon Sep 17 00:00:00 2001
From: Michael Boniface <m.j.boniface@soton.ac.uk>
Date: Wed, 19 Feb 2025 20:07:48 +0000
Subject: [PATCH] Extract process_actions() and rename df_meta to metadata_df

---
 acmc/phen.py | 48 ++++++++++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 20 deletions(-)
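
Reviewer note: the one-hot split that the new process_actions() encapsulates is easy to
misread in isolation. Below is a minimal, self-contained sketch of the same transformation;
the column names ("code", "code_type") and the example values are illustrative stand-ins,
not taken from a real codes file, where the names would come from
file["actions"]["codes_col"] and file["actions"]["split_col"].

import numpy as np
import pandas as pd

# Toy input: one column holds the code, another names the coding system it belongs to
df = pd.DataFrame({
    "code": ["C10", "44054006", "E11"],
    "code_type": ["icd10", "snomed", "icd10"],
})
codes = df["code"]

oh = pd.get_dummies(df["code_type"], dtype=bool)  # one hot encode the type column
oh = oh.where((oh != True), codes, axis=0)        # put each code where its type flag is True
oh[oh == False] = np.nan                          # blank out the remaining False cells
df = pd.concat([df, oh], axis=1)                  # one new column per code type

# df now carries extra "icd10" and "snomed" columns:
#   row 0: icd10="C10",  snomed=NaN
#   row 1: icd10=NaN,    snomed="44054006"
#   row 2: icd10="E11",  snomed=NaN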

diff --git a/acmc/phen.py b/acmc/phen.py
index 282186c..c576e7f 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -313,15 +313,33 @@ def read_table_file(path, excel_sheet=None):
 		
     return df
 
-def preprocess_code(out, codes, codes_file, checker, output_col, df_meta):
+def process_actions(df, file):
+	# Perform structural changes to the file before preprocessing, e.g. split a column holding multiple code types
+	logger.debug("Processing file structural actions")
+	if ("actions" in file and "split_col" in file["actions"] and "codes_col" in file["actions"]):
+		split_col = file["actions"]["split_col"]
+		codes_col = file["actions"]["codes_col"]
+		logger.debug("Action: Splitting", split_col, "column into:", df[split_col].unique(),)
+		codes = df[codes_col]
+		oh = pd.get_dummies(df[split_col], dtype=bool)  # one hot encode
+	oh = oh.where((oh != True), codes, axis=0)  # fill True cells with the code value
+	oh[oh == False] = np.nan  # replace False cells with NaN
+		df = pd.concat([df, oh], axis=1)  # merge in new columns
+
+	return df
+
+def preprocess_code(out, codes, codes_file, checker, output_col, metadata_df):
+
+	# preprocess codes
 	codes = codes.astype(str)  # convert to string
-	codes = codes.str.strip()  # remove excess spaces
+	codes = codes.str.strip()  # remove excess spaces
 	
-	codes, errors = checker.process(codes, codes_file)  # resolve any identified issues
+	codes, errors = checker.process(codes, codes_file)  # resolve any identified issues
 	if len(errors) > 0:
 		raise Exception(f"Code validation failed with {len(errors)} errors")
+		
 	# add metadata columns
-	out = pd.concat([out, pd.DataFrame({output_col: codes}).join(df_meta)], ignore_index=True)
+	out = pd.concat([out, pd.DataFrame({output_col: codes}).join(metadata_df)], ignore_index=True)
 	
 	return out
 
@@ -339,7 +357,7 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=N
 								  codes_file=codes_file,
 								  checker=code_types[target_code_type](file_path),
 								  output_col=target_code_type,
-								  df_meta=df[meta_columns])
+								  metadata_df=df[meta_columns])
 		else:
 			logger.warning(f"No {target_code_type} Codes to process")
 	else:
@@ -352,7 +370,7 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=N
 									  codes_file=codes_file,
 									  checker=v(),
 									  output_col=k,
-									  df_meta=df[meta_columns])
+									  metadata_df=df[meta_columns])
 	return out
 
 # Translate Df with multiple codes into single code type Series
@@ -390,7 +408,7 @@ def convert_codes(df, target, translate):
 # Append file's codes to output Df with concept
 def map_file(df, target_code_type, out, concepts, meta_columns=[], translate=True):
     # seperate out meta_columns
-    df_meta = df[meta_columns]
+    metadata_df = df[meta_columns]
     df = df.drop(columns=meta_columns)
     codes = convert_codes(df, target_code_type, translate)
     codes = codes.dropna()  # delete NaNs
@@ -398,7 +416,7 @@ def map_file(df, target_code_type, out, concepts, meta_columns=[], translate=Tru
     # Append to out df
     if len(codes) > 0:
         codes = pd.DataFrame({"CONCEPT": codes})
-        codes = codes.join(df_meta)
+        codes = codes.join(metadata_df)
         for concept in concepts:
             codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
             out = pd.concat([out, codes])
@@ -446,18 +464,8 @@ def map(phen_dir, target_code_type, translate=True):
 			else:
 				df = read_table_file(path=codes_file_path)
 
-			# Perform Structural Changes to file before preprocessing
-			# split column with multiple code types
-			logger.debug("Processing file structural actions")
-			if ("actions" in file and "split_col" in file["actions"] and "codes_col" in file["actions"]):
-				split_col = file["actions"]["split_col"]
-				codes_col = file["actions"]["codes_col"]
-				logger.debug("Action: Splitting", split_col, "column into:", df[split_col].unique(),)
-				codes = df[codes_col]
-				oh = pd.get_dummies(df[split_col], dtype=bool)  # one hot encode
-				oh = oh.where((oh != True), codes, axis=0)  # fill in 1s with codes
-				oh[oh == False] = np.nan  # replace 0s with None
-				df = pd.concat([df, oh], axis=1)  # merge in new columns
+			# process structural actions
+			df = process_actions(df, file)
 
 			# Preprocessing & Validation Checks		
 			logger.debug("Processing and validating code formats")
-- 
GitLab
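
Reviewer note on the df_meta -> metadata_df rename: the join in preprocess_code() (and the
equivalent one in map_file()) carries the metadata across purely by pandas index alignment,
because metadata_df is sliced from the same frame as the code column. A minimal sketch of
that behaviour, assuming illustrative column names ("icd10", "description") rather than
anything read from a real codes file:

import pandas as pd

# Illustrative stand-ins for the real inputs
df = pd.DataFrame({
    "icd10": [" C10", "E11 ", "I50"],
    "description": ["neoplasm", "diabetes", "heart failure"],
})
meta_columns = ["description"]

metadata_df = df[meta_columns]               # metadata keeps the original row index
codes = df["icd10"].astype(str).str.strip()  # cleaned codes keep that index too

# join() aligns on the shared index, so each code stays paired with its own metadata row
out = pd.DataFrame({"icd10": codes}).join(metadata_df)

#   icd10    description
# 0  C10     neoplasm
# 1  E11     diabetes
# 2  I50     heart failure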