Skip to content
Snippets Groups Projects
Commit 19918ce3 authored by mjbonifa's avatar mjbonifa
Browse files

standardised code_type import in phen

parent 8c8ce07c
No related branches found
No related tags found
No related merge requests found
...@@ -14,8 +14,7 @@ from pathlib import Path ...@@ -14,8 +14,7 @@ from pathlib import Path
from urllib.parse import urlparse, urlunparse from urllib.parse import urlparse, urlunparse
# acmc imports # acmc imports
from acmc import trud, omop from acmc import trud, omop, parse
from acmc.parse import code_types
from acmc.omop import publish_concept_sets, setup from acmc.omop import publish_concept_sets, setup
# setup logging # setup logging
...@@ -258,7 +257,7 @@ def validate(phen_dir): ...@@ -258,7 +257,7 @@ def validate(phen_dir):
# check columns specified are a supported medical coding type # check columns specified are a supported medical coding type
for column in file['columns']: for column in file['columns']:
if column not in code_types and column != 'metadata': if column not in parse.code_types and column != 'metadata':
validation_errors.append(f"Column type {column} for file {concept_code_file_path} is not supported") validation_errors.append(f"Column type {column} for file {concept_code_file_path} is not supported")
# check the actions are supported # check the actions are supported
...@@ -328,6 +327,28 @@ def process_actions(df, file): ...@@ -328,6 +327,28 @@ def process_actions(df, file):
return df return df
def log_invalid_code(codes, mask, code_type=None, file_path=None, cause=None):
    """Record codes that failed validation in the error-log CSV and return the valid ones.

    Args:
        codes: pandas Series of codes that were checked.
        mask: boolean Series aligned with ``codes``; True marks a valid code.
        code_type: coding vocabulary the codes belong to (written to the
            VOCABULARY column) — presumably a key of ``parse.code_types``;
            TODO confirm against callers.
        file_path: source concept file the codes came from (SOURCE column).
        cause: short description of why the codes are invalid (CAUSE column).

    Returns:
        The subset of ``codes`` where ``mask`` is True.
    """
    # Invalid codes are stringified so the CSV round-trips consistently.
    errors = pd.DataFrame(
        {
            "CONCEPT": codes[~mask].astype(str),
            "VOCABULARY": code_type,
            "SOURCE": file_path,
            "CAUSE": cause,
        }
    )

    # Append to any rows logged by earlier validation passes; write once
    # instead of duplicating to_csv in both branches.
    if os.path.exists(log_errors_path):
        errors = pd.concat([pd.read_csv(log_errors_path), errors])
    errors.to_csv(log_errors_path, index=False)

    return codes[mask]
def preprocess_code(out, codes, codes_file, checker, output_col, metadata_df): def preprocess_code(out, codes, codes_file, checker, output_col, metadata_df):
# preprocess codes # preprocess codes
...@@ -338,6 +359,7 @@ def preprocess_code(out, codes, codes_file, checker, output_col, metadata_df): ...@@ -338,6 +359,7 @@ def preprocess_code(out, codes, codes_file, checker, output_col, metadata_df):
if len(errors) > 0: if len(errors) > 0:
raise Exception(f"Code validation failed with {len(errors)} errors") raise Exception(f"Code validation failed with {len(errors)} errors")
# add metadata columns # add metadata columns
out = pd.concat([out, pd.DataFrame({output_col: codes}).join(metadata_df)], ignore_index=True) out = pd.concat([out, pd.DataFrame({output_col: codes}).join(metadata_df)], ignore_index=True)
...@@ -362,14 +384,14 @@ def preprocess(df, file, target_code_type=None, codes_file=None, translate=True, ...@@ -362,14 +384,14 @@ def preprocess(df, file, target_code_type=None, codes_file=None, translate=True,
out = preprocess_code(out=out, out = preprocess_code(out=out,
codes=df[file[columns][target_code_type]].dropna(), codes=df[file[columns][target_code_type]].dropna(),
codes_file=codes_file, codes_file=codes_file,
checker=code_types[target_code_type](file_path), checker=parse.code_types[target_code_type](),
output_col=target_code_type, output_col=target_code_type,
metadata_df=df[meta_columns]) metadata_df=df[meta_columns])
else: else:
logger.warning(f"No {target_code_type} Codes to process") logger.warning(f"No {target_code_type} Codes to process")
else: else:
# QA for every code type in df run preprocess_code() # QA for every code type in df run preprocess_code()
for k, v in code_types.items(): for k, v in parse.code_types.items():
if k in file['columns']: if k in file['columns']:
logger.info(f"Processing {k} Codes...") logger.info(f"Processing {k} Codes...")
out = preprocess_code(out=out, out = preprocess_code(out=out,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment