Skip to content
Snippets Groups Projects
Commit 74638822 authored by mjbonifa's avatar mjbonifa
Browse files

moved publish.py to the output of the map function rather than a separate...

moved publish.py to the output of the map function rather than a separate operation, so the individual files are always created when you run acmc map
parent e5cbdbd2
No related branches found
No related tags found
No related merge requests found
...@@ -402,11 +402,7 @@ def sql_row_exist(conn, table, column, value): ...@@ -402,11 +402,7 @@ def sql_row_exist(conn, table, column, value):
return exists return exists
def map(phen_dir, def map(phen_dir, target_code_type, translate=True, verify=True):
target_code_type,
translate=True,
verify=True):
print(f"Processing phenotype directory: {phen_dir}") print(f"Processing phenotype directory: {phen_dir}")
print(f"Target coding format: {target_code_type}") print(f"Target coding format: {target_code_type}")
print(f"Translating: {translate}") print(f"Translating: {translate}")
...@@ -428,7 +424,7 @@ def map(phen_dir, ...@@ -428,7 +424,7 @@ def map(phen_dir,
# Create output dataframe # Create output dataframe
out = pd.DataFrame([]) out = pd.DataFrame([])
# Iterate JSON mapping file (OBJECT FORMAT) # Process each folder in codes section
for folder in codes: for folder in codes:
print(bcolors.HEADER, folder["description"], bcolors.ENDC) print(bcolors.HEADER, folder["description"], bcolors.ENDC)
if "files" in folder: if "files" in folder:
...@@ -503,16 +499,16 @@ def map(phen_dir, ...@@ -503,16 +499,16 @@ def map(phen_dir,
else: else:
print("Folder is empty") print("Folder is empty")
# check if out is empty # test if there's any output from processing
if len(out) <= 0: if len(out) <= 0:
raise Exception("Output dataframe is empty") raise Exception("Processing has not produced any output")
# Final Processing # Final processing
out = out.reset_index(drop=True) out = out.reset_index(drop=True)
out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"]) out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"]) out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
# Add Concept Set Defintions metadata # Add concept set definition metadata
concept_sets_df = pd.DataFrame(concept_sets["concept_set"]) # transform to dataframe concept_sets_df = pd.DataFrame(concept_sets["concept_set"]) # transform to dataframe
if "metadata" in concept_sets_df.columns: if "metadata" in concept_sets_df.columns:
concept_sets_df = concept_sets_df.join(pd.json_normalize(concept_sets_df["metadata"])) # metadata to columns concept_sets_df = concept_sets_df.join(pd.json_normalize(concept_sets_df["metadata"])) # metadata to columns
...@@ -521,9 +517,8 @@ def map(phen_dir, ...@@ -521,9 +517,8 @@ def map(phen_dir,
concept_sets_df = concept_sets_df.drop_duplicates() # remove duplicates concept_sets_df = concept_sets_df.drop_duplicates() # remove duplicates
out = out.merge(concept_sets_df, how="left", on="CONCEPT_SET") # merge with output out = out.merge(concept_sets_df, how="left", on="CONCEPT_SET") # merge with output
# Save Output File # Save output to map directory
print(bcolors.HEADER, "---" * 5, "OUTPUT", "---" * 5, bcolors.ENDC) print(bcolors.HEADER, "---" * 5, "OUTPUT", "---" * 5, bcolors.ENDC)
print(out)
if translate: if translate:
output_filename = target_code_type + '.csv' output_filename = target_code_type + '.csv'
...@@ -533,9 +528,9 @@ def map(phen_dir, ...@@ -533,9 +528,9 @@ def map(phen_dir,
map_path = phen_path / MAP_DIR / output_filename map_path = phen_path / MAP_DIR / output_filename
out.to_csv(map_path, index=False) out.to_csv(map_path, index=False)
print("Saved translations to", map_path) print(f"Saved mapped concepts to {str(map_path.resolve())}")
# Save Error File # save error File
error_path = phen_path / ERROR_FILE error_path = phen_path / ERROR_FILE
if error_path.exists(): if error_path.exists():
error_df = pd.read_csv(error_path) error_df = pd.read_csv(error_path)
...@@ -543,6 +538,29 @@ def map(phen_dir, ...@@ -543,6 +538,29 @@ def map(phen_dir,
error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"]) error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"])
error_df.to_csv(error_path, index=False) error_df.to_csv(error_path, index=False)
# Save each concept set as its own CSV under <phen_path>/<CONCEPT_SET_DIR>/<target_code_type>.
concept_set_path = phen_path / CONCEPT_SET_DIR / target_code_type

# Empty the concept-set directory if it already exists, but keep git
# bookkeeping entries. BUGFIX: compare item.name (a string) against the
# git_items strings — the original compared the Path object itself, which
# never matches a string, so '.git'/'.gitkeep' would have been deleted too.
# Also restrict unlink() to files: unlink() raises on a directory entry
# such as a real .git directory.
git_items = ['.git', '.gitkeep']
if concept_set_path.exists():
    for item in concept_set_path.iterdir():
        if item.name not in git_items and item.is_file():
            item.unlink()
else:
    concept_set_path.mkdir(parents=True, exist_ok=True)

# Write each concept set as a separate, deterministically ordered file.
for name, concept in out.groupby("CONCEPT_SET"):
    concept = concept.sort_values(by="CONCEPT")  # sort rows
    concept = concept.dropna(how='all', axis=1)  # remove empty cols
    concept = concept.reindex(sorted(concept.columns), axis=1)  # sort cols alphabetically
    filename = f"{name}.csv"
    concept_path = concept_set_path / filename
    concept.to_csv(concept_path, index=False)

print(f"Saved concept_sets to {str(concept_set_path.resolve())}")
print(f"Phenotype processing completed") print(f"Phenotype processing completed")
def publish(phen_dir): def publish(phen_dir):
......
import os
import pandas as pd
import argparse
def main(config):
    """Split a single concepts CSV into one CSV file per concept set.

    Reads the CSV at config["concepts"] (must have a '.csv' extension),
    groups rows by the CONCEPT_SET column, and writes each group to
    <config["output"]>/<concept-set-name>.csv with rows sorted by CONCEPT,
    all-empty columns dropped, and columns ordered alphabetically.

    Raises:
        Exception: if the concepts file path does not end in '.csv'.
    """
    concepts_file = config["concepts"]
    # Guard clause: only '.csv' input is supported.
    if not concepts_file.endswith(".csv"):
        raise Exception("Concepts file must be '.csv' filetype")
    df = pd.read_csv(concepts_file, dtype=str)

    for concept_set_name, group in df.groupby("CONCEPT_SET"):
        group = group.sort_values(by="CONCEPT")                    # rows ordered by concept code
        group = group.dropna(how='all', axis=1)                    # drop columns with no values
        group = group.reindex(sorted(group.columns), axis=1)       # alphabetical column order
        out_path = os.path.join(config["output"], str(concept_set_name) + ".csv")
        group.to_csv(out_path, index=False)                        # one file per concept set
if __name__ == '__main__':
    # CLI entry point: split one concepts CSV into per-concept-set CSV files.
    arg_parser = argparse.ArgumentParser(
        description="Script divides single CSV file into one CSV per concept",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    arg_parser.add_argument("concepts", help="Output Concepts CSV file")
    arg_parser.add_argument("output", help="Output Folder")
    main(vars(arg_parser.parse_args()))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment