diff --git a/phen.py b/phen.py index 321a71cfc0b619e22820e3b97bfaa68657bbc8d3..f6c99fe57eb2929c259a42f79a12c40940ba2691 100644 --- a/phen.py +++ b/phen.py @@ -402,11 +402,7 @@ def sql_row_exist(conn, table, column, value): return exists -def map(phen_dir, - target_code_type, - translate=True, - verify=True): - +def map(phen_dir, target_code_type, translate=True, verify=True): print(f"Processing phenotype directory: {phen_dir}") print(f"Target coding format: {target_code_type}") print(f"Translating: {translate}") @@ -428,7 +424,7 @@ def map(phen_dir, # Create output dataframe out = pd.DataFrame([]) - # Iterate JSON mapping file (OBJECT FORMAT) + # Process each folder in codes section for folder in codes: print(bcolors.HEADER, folder["description"], bcolors.ENDC) if "files" in folder: @@ -503,16 +499,16 @@ def map(phen_dir, else: print("Folder is empty") - # check if out is empty + # test if there's any output from processing if len(out) <= 0: - raise Exception("Output dataframe is empty") + raise Exception("Processing has not produced any output") - # Final Processing + # Final processing out = out.reset_index(drop=True) out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"]) out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"]) - # Add Concept Set Defintions metadata + # Add concept set definition metadata concept_sets_df = pd.DataFrame(concept_sets["concept_set"]) # transform to dataframe if "metadata" in concept_sets_df.columns: concept_sets_df = concept_sets_df.join(pd.json_normalize(concept_sets_df["metadata"])) # metadata to columns @@ -521,9 +517,8 @@ def map(phen_dir, concept_sets_df = concept_sets_df.drop_duplicates() # remove duplicates out = out.merge(concept_sets_df, how="left", on="CONCEPT_SET") # merge with output - # Save Output File + # Save output to map directory print(bcolors.HEADER, "---" * 5, "OUTPUT", "---" * 5, bcolors.ENDC) - print(out) if translate: output_filename = target_code_type + '.csv' @@ -533,9 +528,9 @@ def map(phen_dir, 
map_path = phen_path / MAP_DIR / output_filename out.to_csv(map_path, index=False) - print("Saved translations to", map_path) + print(f"Saved mapped concepts to {str(map_path.resolve())}") - # Save Error File + # Save error file error_path = phen_path / ERROR_FILE if error_path.exists(): error_df = pd.read_csv(error_path) @@ -543,6 +538,29 @@ def map(phen_dir, 
- concept = concept.dropna(how='all', axis=1) #remove empty cols - concept = concept.reindex(sorted(concept.columns), axis=1) #sort cols alphabetically - - concept.to_csv(os.path.join(config["output"], str(name)+".csv"), #save to csv - index=False ) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Script divides single CSV file into one CSV per concept", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument("concepts", help="Output Concepts CSV file") - parser.add_argument("output", help="Output Folder") - # parser.add_argument("version", help="Version of output") - - args = parser.parse_args() - config = vars(args) - main(config)