Skip to content
Snippets Groups Projects
Commit 74638822 authored by mjbonifa's avatar mjbonifa
Browse files

moved publish.py to the output of the map function rather than a separate...

moved publish.py to the output of the map function rather than a separate operation, so the individual files are always created when you run acmc map
parent e5cbdbd2
No related branches found
No related tags found
No related merge requests found
...@@ -402,11 +402,7 @@ def sql_row_exist(conn, table, column, value): ...@@ -402,11 +402,7 @@ def sql_row_exist(conn, table, column, value):
return exists return exists
def map(phen_dir, def map(phen_dir, target_code_type, translate=True, verify=True):
target_code_type,
translate=True,
verify=True):
print(f"Processing phenotype directory: {phen_dir}") print(f"Processing phenotype directory: {phen_dir}")
print(f"Target coding format: {target_code_type}") print(f"Target coding format: {target_code_type}")
print(f"Translating: {translate}") print(f"Translating: {translate}")
...@@ -428,7 +424,7 @@ def map(phen_dir, ...@@ -428,7 +424,7 @@ def map(phen_dir,
# Create output dataframe # Create output dataframe
out = pd.DataFrame([]) out = pd.DataFrame([])
# Iterate JSON mapping file (OBJECT FORMAT) # Process each folder in codes section
for folder in codes: for folder in codes:
print(bcolors.HEADER, folder["description"], bcolors.ENDC) print(bcolors.HEADER, folder["description"], bcolors.ENDC)
if "files" in folder: if "files" in folder:
...@@ -503,16 +499,16 @@ def map(phen_dir, ...@@ -503,16 +499,16 @@ def map(phen_dir,
else: else:
print("Folder is empty") print("Folder is empty")
# check if out is empty # test if there's any output from processing
if len(out) <= 0: if len(out) <= 0:
raise Exception("Output dataframe is empty") raise Exception("Processing has not produced any output")
# Final Processing # Final processing
out = out.reset_index(drop=True) out = out.reset_index(drop=True)
out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"]) out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"]) out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
# Add Concept Set Defintions metadata # Add concept set definition metadata
concept_sets_df = pd.DataFrame(concept_sets["concept_set"]) # transform to dataframe concept_sets_df = pd.DataFrame(concept_sets["concept_set"]) # transform to dataframe
if "metadata" in concept_sets_df.columns: if "metadata" in concept_sets_df.columns:
concept_sets_df = concept_sets_df.join(pd.json_normalize(concept_sets_df["metadata"])) # metadata to columns concept_sets_df = concept_sets_df.join(pd.json_normalize(concept_sets_df["metadata"])) # metadata to columns
...@@ -521,9 +517,8 @@ def map(phen_dir, ...@@ -521,9 +517,8 @@ def map(phen_dir,
concept_sets_df = concept_sets_df.drop_duplicates() # remove duplicates concept_sets_df = concept_sets_df.drop_duplicates() # remove duplicates
out = out.merge(concept_sets_df, how="left", on="CONCEPT_SET") # merge with output out = out.merge(concept_sets_df, how="left", on="CONCEPT_SET") # merge with output
# Save Output File # Save output to map directory
print(bcolors.HEADER, "---" * 5, "OUTPUT", "---" * 5, bcolors.ENDC) print(bcolors.HEADER, "---" * 5, "OUTPUT", "---" * 5, bcolors.ENDC)
print(out)
if translate: if translate:
output_filename = target_code_type + '.csv' output_filename = target_code_type + '.csv'
...@@ -533,9 +528,9 @@ def map(phen_dir, ...@@ -533,9 +528,9 @@ def map(phen_dir,
map_path = phen_path / MAP_DIR / output_filename map_path = phen_path / MAP_DIR / output_filename
out.to_csv(map_path, index=False) out.to_csv(map_path, index=False)
print("Saved translations to", map_path) print(f"Saved mapped concepts to {str(map_path.resolve())}")
# Save Error File # save error File
error_path = phen_path / ERROR_FILE error_path = phen_path / ERROR_FILE
if error_path.exists(): if error_path.exists():
error_df = pd.read_csv(error_path) error_df = pd.read_csv(error_path)
...@@ -543,6 +538,29 @@ def map(phen_dir, ...@@ -543,6 +538,29 @@ def map(phen_dir,
error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"]) error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"])
error_df.to_csv(error_path, index=False) error_df.to_csv(error_path, index=False)
# Save each concept set as its own CSV under <phen_path>/<CONCEPT_SET_DIR>/<target_code_type>.
concept_set_path = phen_path / CONCEPT_SET_DIR / target_code_type

# Empty the concept-set directory if it already exists, but keep git
# bookkeeping entries. BUGFIX: compare item.name (a string) against the
# git_items strings — the original compared the Path object itself, which
# never matches a string, so '.git'/'.gitkeep' would have been deleted too.
# Also restrict unlink() to files: unlink() raises on a directory entry
# such as a real .git directory.
git_items = ['.git', '.gitkeep']
if concept_set_path.exists():
    for item in concept_set_path.iterdir():
        if item.name not in git_items and item.is_file():
            item.unlink()
else:
    concept_set_path.mkdir(parents=True, exist_ok=True)

# Write each concept set as a separate, deterministically ordered file.
for name, concept in out.groupby("CONCEPT_SET"):
    concept = concept.sort_values(by="CONCEPT")  # sort rows
    concept = concept.dropna(how='all', axis=1)  # remove empty cols
    concept = concept.reindex(sorted(concept.columns), axis=1)  # sort cols alphabetically
    filename = f"{name}.csv"
    concept_path = concept_set_path / filename
    concept.to_csv(concept_path, index=False)

print(f"Saved concept_sets to {str(concept_set_path.resolve())}")
print(f"Phenotype processing completed") print(f"Phenotype processing completed")
def publish(phen_dir): def publish(phen_dir):
......
import os
import pandas as pd
import argparse
def main(config):
    """Split a single concepts CSV into one CSV file per concept set.

    Reads the CSV at config["concepts"] (must have a '.csv' extension),
    groups rows by the CONCEPT_SET column, and writes each group to
    <config["output"]>/<concept-set-name>.csv with rows sorted by CONCEPT,
    all-empty columns dropped, and columns ordered alphabetically.

    Raises:
        Exception: if the concepts file path does not end in '.csv'.
    """
    concepts_file = config["concepts"]
    # Guard clause: only '.csv' input is supported.
    if not concepts_file.endswith(".csv"):
        raise Exception("Concepts file must be '.csv' filetype")
    df = pd.read_csv(concepts_file, dtype=str)

    for concept_set_name, group in df.groupby("CONCEPT_SET"):
        group = group.sort_values(by="CONCEPT")                    # rows ordered by concept code
        group = group.dropna(how='all', axis=1)                    # drop columns with no values
        group = group.reindex(sorted(group.columns), axis=1)       # alphabetical column order
        out_path = os.path.join(config["output"], str(concept_set_name) + ".csv")
        group.to_csv(out_path, index=False)                        # one file per concept set
if __name__ == '__main__':
    # CLI entry point: split one concepts CSV into per-concept-set CSV files.
    arg_parser = argparse.ArgumentParser(
        description="Script divides single CSV file into one CSV per concept",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    arg_parser.add_argument("concepts", help="Output Concepts CSV file")
    arg_parser.add_argument("output", help="Output Folder")
    main(vars(arg_parser.parse_args()))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment