From 945ff7caa9c45bbe69b1478ec02cd25cc370e8b3 Mon Sep 17 00:00:00 2001 From: Michael Boniface <m.j.boniface@soton.ac.uk> Date: Sat, 15 Feb 2025 17:27:34 +0000 Subject: [PATCH] fixed default output and error files --- README.md | 90 ++++++++++++++++++++++++++++--------------------------- acmc.py | 12 ++++---- map.py | 88 +++++++++++++---------------------------------------- parse.py | 42 +++++++------------------- 4 files changed, 83 insertions(+), 149 deletions(-) diff --git a/README.md b/README.md index 35c3796..c2dd4da 100644 --- a/README.md +++ b/README.md @@ -125,51 +125,53 @@ Phenotypes are defined in a JSON configuration file. The file describes how sour An example concept set and code list for Abdominal Pain is show below: ```json - { - "concept_sets": { - "version": "3.2.10", - "omop": { - "vocabulary_id": "MELDB", - "vocabulary_name": "Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity", - "vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb" - }, - "concept_set": [ - { - "concept_set_name": "ABDO_PAIN", - "concept_set_status": "AGREED", - "metadata": { - "#": "18", - "CONCEPT DESCRIPTION": "Abdominal pain", - "CONCEPT TYPE": "Workload indicator (symptom)", - "DATE ADDED ": "2023-08-25", - "REQUEST REASON ": "Clinician SF - requested by email - symptom example from Qualitative Evidence Synthesis", - "SOURCE INFO": "YES", - "FUNCTION": "QUERY BY CODING LIST", - "FUNCTION.1": "https://clinicalcodes.rss.mhs.man.ac.uk/", - "CODING LIST": "https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/codes/ClinicalCodes.org%20from%20the%20University%20of%20Manchester/Symptom%20code%20lists/Abdominal%20pain/res176-abdominal-pain.csv ", - "NOTES": "2023-09-08: Clinical SF confirmed that the clinical view would be that this would need to be recurrent or persistent.", - } - }, - } - "codes": [ - { - "folder": "codes/ClinicalCodes.org from the University of Manchester", - 
"description": "SF's clinical codes - downloaded 16/11/23", - "files": [ - { - "file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv", - "columns": { - "read2_code": "code", - "metadata": [ - "description" - ] - }, - "concept_set": [ - "ABDO_PAIN" - ] - }, - } + "concept_sets": { + "version": "3.2.10", + "omop": { + "vocabulary_id": "MELDB", + "vocabulary_name": "Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity", + "vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb" + }, + "concept_set": [ + { + "concept_set_name": "ABDO_PAIN", + "concept_set_status": "AGREED", + "metadata": { + "#": "18", + "CONCEPT DESCRIPTION": "Abdominal pain", + "CONCEPT TYPE": "Workload indicator (symptom)", + "DATE ADDED ": "2023-08-25", + "REQUEST REASON ": "Clinician SF - requested by email - symptom example from Qualitative Evidence Synthesis", + "SOURCE INFO": "YES", + "FUNCTION": "QUERY BY CODING LIST", + "FUNCTION.1": "https://clinicalcodes.rss.mhs.man.ac.uk/", + "CODING LIST": "https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/codes/ClinicalCodes.org%20from%20the%20University%20of%20Manchester/Symptom%20code%20lists/Abdominal%20pain/res176-abdominal-pain.csv ", + "NOTES": "2023-09-08: Clinical SF confirmed that the clinical view would be that this would need to be recurrent or persistent." 
+ } + } + ] + }, + "codes": [ + { + "folder": "clinical-codes-org", + "description": "SF's clinical codes - downloaded 16/11/23", + "files": [ + { + "file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv", + "columns": { + "read2_code": "code", + "metadata": [ + "description" + ] + }, + "concept_set": [ + "ABDO_PAIN" + ] + } + ] + } + ] } ``` diff --git a/acmc.py b/acmc.py index f7c5817..ae4b051 100644 --- a/acmc.py +++ b/acmc.py @@ -4,6 +4,8 @@ import trud import omop import map +from pathlib import Path + def trud_install(args): """Handle the `trud install` command.""" print(f"Installing TRUD") @@ -31,7 +33,7 @@ def omop_delete(args): def map_process(args): """Handle the `map process` command.""" print(f"Processing map with phenotype config file: {args.config_file}") - print(f"Output directory: {args.output_dir}") + print(f"Output directory: {args.output_file}") print(f"Target coding format: {args.target_coding}") if args.translate: print("Translating code types.") @@ -51,8 +53,8 @@ def map_process(args): args.target_coding, args.translate, args.verify, - args.error_log, - output_path="MELD_concepts_read.csv") + error_path=Path(args.error_log), + output_path=Path(args.output_file)) print(f"Phenotype processing completed") @@ -95,15 +97,15 @@ def main(): map_process_parser = map_subparsers.add_parser("process", help="Process map configuration file") map_process_parser.add_argument("-c", "--config-file", required=True, help="Phenotype configuration file") map_process_parser.add_argument("-s", "--source-codes-dir", required=True, help="Source codes root directory") - map_process_parser.add_argument("-o", "--output-dir", required=True, help="Output directory for CSV or OMOP database") map_process_parser.add_argument("-t", "--target-coding", required=True, choices=['read2', 'read3', 'icd10', 'snomed', 'opcs4'], help="Specify the target coding (read2, read3, icd10, snomed, opcs4)") + map_process_parser.add_argument("-o", "--output-file", type=str, 
default=str(map.OUTPUT_PATH.resolve()), help="Output file for CSV or OMOP database") # Flags map_process_parser.add_argument("-tr", "--translate", action="store_true", default=False, help="Do not translate code types") map_process_parser.add_argument("-v", "--verify", action="store_true", default=False, help="Do not verify codes") # Error log file - map_process_parser.add_argument("-l", "--error-log", type=str, default='error.csv', help="Filepath to save error log to") + map_process_parser.add_argument("-l", "--error-log", type=str, default=str(map.ERROR_PATH.resolve()), help="Filepath to save error log to") # Set the function to call when 'process' subcommand is used map_process_parser.set_defaults(func=map_process) diff --git a/map.py b/map.py index c468dfa..5456de7 100644 --- a/map.py +++ b/map.py @@ -24,6 +24,8 @@ from omop import setup pd.set_option("mode.chained_assignment", None) +OUTPUT_PATH = Path('build') / 'phenotype_mapping.csv' +ERROR_PATH = Path('build') / 'errors.csv' def read_table_file(path, excel_sheet=None): """ @@ -166,8 +168,7 @@ def sql_row_exist(conn, table, column, value): return exists - -def process(config_file, source_codes_dir, target_code_type, translate=True, verify=True, log_errors_path="errors.csv", output_path="MELD_concepts_read.csv"): +def process(config_file, source_codes_dir, target_code_type, translate=True, verify=True, error_path=ERROR_PATH, output_path=OUTPUT_PATH): config_path = Path(config_file) if not config_path.is_file(): raise FileNotFoundError(f"Error: phenotype configuration file '{config_path}' does not exist.") @@ -196,27 +197,16 @@ def process(config_file, source_codes_dir, target_code_type, translate=True, ver # Load Code File if "excel_sheet" in file: - df = read_table_file( - path=file_path, excel_sheet=file["excel_sheet"] - ) + df = read_table_file(path=file_path, excel_sheet=file["excel_sheet"]) else: df = read_table_file(path=file_path) # Perform Structural Changes to file before preprocessing # split
column with multiple code types - if ( - "actions" in file - and "split_col" in file["actions"] - and "codes_col" in file["actions"] - ): + if ("actions" in file and "split_col" in file["actions"] and "codes_col" in file["actions"]): split_col = file["actions"]["split_col"] codes_col = file["actions"]["codes_col"] - print( - "Action: Splitting", - split_col, - "column into:", - df[split_col].unique(), - ) + print("Action: Splitting", split_col, "column into:", df[split_col].unique(),) codes = df[codes_col] oh = pd.get_dummies(df[split_col], dtype=bool) # one hot encode oh = oh.where((oh != True), codes, axis=0) # fill in 1s with codes @@ -231,74 +221,36 @@ def process(config_file, source_codes_dir, target_code_type, translate=True, ver # TODO: enable metacolumns to be outputted - problem with map_file appending if "metadata" in file["columns"]: meta_columns += file["columns"]["metadata"] - df = preprocess( - df, - file["columns"], - meta_columns=meta_columns, - file_path=file_path, - target_code_type=target_code_type, - verify=verify, - translate=translate, - ) + df = preprocess(df, file["columns"], meta_columns=meta_columns, file_path=file_path, target_code_type=target_code_type, verify=verify, translate=translate) else: raise Exception("No column format provided") # partition table by categorical column - if ( - "actions" in file - and "divide_col" in file["actions"] - and len(df) > 0 - ): + if ("actions" in file and "divide_col" in file["actions"] and len(df) > 0): divide_col = file["actions"]["divide_col"] - print( - "Action: Dividing Table by", - divide_col, - "column into: ", - df[divide_col].unique(), - ) + print("Action: Dividing Table by", divide_col, "column into: ", df[divide_col].unique(),) df = df.groupby(divide_col) # Map to MELDB Concept/Phenotype if len(df) == 0: pass # out = df - elif ("concept_set" in file) and isinstance( - df, pd.core.frame.DataFrame - ): - out = map_file( - df, - target_code_type, - out, - concepts=file["concept_set"], - 
meta_columns=meta_columns, - translate=translate, - ) - elif ("concept_set_categories" in file) and isinstance( - df, pd.core.groupby.generic.DataFrameGroupBy - ): + elif ("concept_set" in file) and isinstance(df, pd.core.frame.DataFrame): + out = map_file(df, target_code_type, out, concepts=file["concept_set"], meta_columns=meta_columns, translate=translate,) + elif ("concept_set_categories" in file) and isinstance(df, pd.core.groupby.generic.DataFrameGroupBy): meta_columns.remove(divide_col) # delete categorical column for cat, grp in df: - if ( - cat in file["concept_set_categories"].keys() - ): # check if category is mapped - grp = grp.drop( - columns=[divide_col] - ) # delete categorical column + if (cat in file["concept_set_categories"].keys()): # check if category is mapped + grp = grp.drop(columns=[divide_col]) # delete categorical column print("Category:", cat) - out = map_file( - grp, - target_code_type, - out, - concepts=file["concept_set_categories"][cat], - meta_columns=meta_columns, - ) + out = map_file(grp, target_code_type, out, concepts=file["concept_set_categories"][cat], meta_columns=meta_columns,) else: print("Folder is empty") # check if out is empty if len(out) <= 0: - raise Exception("Output file is empty") + raise Exception("Output dataframe is empty") # Final Processing out = out.reset_index(drop=True) @@ -340,11 +292,11 @@ def process(config_file, source_codes_dir, target_code_type, translate=True, ver else: # export as CSV to /output out.to_csv(output_path, index=False) - print("saved to", output_path) + print("Saved to", output_path) # Save Error File - if os.path.exists(log_errors_path): - error_df = pd.read_csv(log_errors_path) + if error_path.exists(): + error_df = pd.read_csv(error_path) error_df = error_df.drop_duplicates() # Remove Duplicates from Error file error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"]) - error_df.to_csv(log_errors_path, index=False) + error_df.to_csv(error_path, index=False) diff --git 
a/parse.py b/parse.py index 33907e6..ddf0748 100644 --- a/parse.py +++ b/parse.py @@ -68,13 +68,12 @@ class Proto_code(): class Read2_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) + input_path = trud.MAPS_PROCESSED_DIR / 'read2_code.parquet' if not input_path.is_file(): raise FileNotFoundError(f"Error: Read2 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly") self.db = pd.read_parquet(input_path) - self.arg_small = "-r2" - self.arg_long = "--read2-code" - self.arg_help = "Read V2 Codes Column name in Source File" + self.checks = [ ( "Not Empty", @@ -115,9 +114,6 @@ class Read2_code(Proto_code): class Read3_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - self.arg_small = "-r3" - self.arg_long = "--read3-code" - self.arg_help = "Read V3 Codes Column name in Source File" input_path = trud.MAPS_PROCESSED_DIR / 'read3_code.parquet' if not input_path.is_file(): @@ -163,9 +159,6 @@ class Read3_code(Proto_code): class Icd10_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - self.arg_small = "-i" - self.arg_long = "--icd10-code" - self.arg_help = "ICD10 Codes Column name in Source File" input_path = trud.MAPS_PROCESSED_DIR / 'icd10_code.parquet' if not input_path.is_file(): @@ -229,9 +222,6 @@ class Icd10_code(Proto_code): class Snomed_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - self.arg_small = "-s" - self.arg_long = "--snomed-code" - self.arg_help = "SNOMED Codes Column name in Source File" input_path = trud.MAPS_PROCESSED_DIR / 'snomed_code.parquet' if not input_path.is_file(): @@ -289,9 +279,6 @@ class Snomed_code(Proto_code): class Opcs4_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - self.arg_small = "-o" - self.arg_long = "--opcs4-code" - self.arg_help = "OPCS4 Codes Column name in Source File" input_path = trud.MAPS_PROCESSED_DIR / 
'opcs4_code.parquet' if not input_path.is_file(): @@ -317,9 +304,6 @@ class Opcs4_code(Proto_code): class Atc_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - self.arg_small = "-a" - self.arg_long = "--atc-code" - self.arg_help = "ATC Codes Column name in Source File" self.checks = [ ( "Not Empty", @@ -340,9 +324,6 @@ class Atc_code(Proto_code): class Med_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - self.arg_small = "-m" - self.arg_long = "--med-code" - self.arg_help = "Med Codes Column name in Source File" self.checks = [ ( "Not Empty", @@ -354,9 +335,6 @@ class Med_code(Proto_code): class Cprd_code(Proto_code): def __init__(self, file_path=None): super().__init__(file_path) - self.arg_small = "-c" - self.arg_long = "--cprd-code" - self.arg_help = "CPRD Product Codes Column name in Source File" self.checks = [ ( "Not Empty", @@ -366,14 +344,14 @@ class Cprd_code(Proto_code): ] code_types = { - "read2_code": Read2_code, - "read3_code": Read3_code, - "icd10_code": Icd10_code, - "snomed_code": Snomed_code, - "opcs4_code": Opcs4_code, - "atc_code": Atc_code, - "med_code": Med_code, - "cprd_code": Cprd_code, + "read2": Read2_code, + "read3": Read3_code, + "icd10": Icd10_code, + "snomed": Snomed_code, + "opcs4": Opcs4_code, + "atc": Atc_code, + "med": Med_code, + "cprd": Cprd_code, } vocab_types = { -- GitLab