diff --git a/base.py b/base.py
index 6e7076326e3e03ca806342d0cf32902915c519d1..97657274a25e381838ff76872643d42964e39852 100644
--- a/base.py
+++ b/base.py
@@ -1,8 +1,8 @@
 import pandas as pd
 import os
 
-log_errors_path = "output/MELD_errors.csv"
-output_path = "output/MELD_concepts_read.csv"
+#TODO: set error file location from cmd
+log_errors_path = "MELD_errors.csv"
 
 class bcolors: #for printing coloured text
     HEADER = '\033[95m'
@@ -17,7 +17,9 @@ class bcolors: #for printing coloured text
 
 def raise_(ex):
     raise ex
-    
+
+
+
 def log_invalid_code(codes, mask, code_type=None, file_path=None, cause=None):
     print("ERROR WITH CODES", file_path, codes[~mask])
 
diff --git a/main.py b/main.py
index 3c789e5f96ef3367d69493d43264fc780bb464bc..b089cf6fbc110fa74b328174c105862d930e7227 100644
--- a/main.py
+++ b/main.py
@@ -5,8 +5,6 @@
 import numpy as np
 import json
 import os
-from base import output_path
-from base import log_errors_path
 from base import log_invalid_code
 from base import bcolors
 from base import raise_
@@ -22,7 +20,7 @@
 from parse import code_types
 pd.set_option('mode.chained_assignment', None)
 
-def read_code_file(path, excel_sheet=None):
+def read_table_file(path, excel_sheet=None):
     """
     Load Code List File
     """
@@ -142,16 +140,21 @@ def map_file(df, target_code_type, out, concepts, meta_columns=[], no_translate=
         out = pd.concat([out, codes])
     return out
 
-def main(config):
+def run_all(mapping_file, target_code_type,
+            no_translate=False, no_verify=False,
+            log_errors_path="MELD_errors.csv",
+            output_path="MELD_concepts_read.csv"):
+
     #Load Mapping File
-    if config["map"].endswith(".json"):
-        folders = json.load(open(config["map"],'rb'))
+    if mapping_file.endswith(".json"):
+        mapping = json.load(open(mapping_file,'rb'))
+        folders = mapping["codes"]
     else:
         raise Exception("Unsupported filetype provided for source file")
 
     out = pd.DataFrame([]) #Create Output File to append to
 
-    #Iteratie JSON mapping file (OBJECT FORMAT)
+    #Iterate JSON mapping file (OBJECT FORMAT)
     for folder in folders:
         print(bcolors.HEADER, folder["description"], bcolors.ENDC)
         if "files" in folder:
@@ -161,10 +164,10 @@ def main(config):
 
                 #Load Code File
                 if "excel_sheet" in file:
-                    df = read_code_file(path=file_path,
+                    df = read_table_file(path=file_path,
                                         excel_sheet = file["excel_sheet"])
                 else:
-                    df = read_code_file(path=file_path)
+                    df = read_table_file(path=file_path)
 
                 #Perform Structural Changes to file before preprocessing
                 #split column with multiple code types
@@ -178,15 +181,6 @@ def main(config):
                     oh = oh.where((oh != True), codes, axis=0) #fill in 1s with codes
                     oh[oh == False] = np.NaN #replace 0s with None
                     df = pd.concat([df, oh], axis=1) #merge in new columns
-
-                #check which code type is the target
-                specified = False
-                for k in code_types.keys():
-                    if config[k]:
-                        specified = True
-                        target_code_type = k
-                if not specified:
-                    raise Exception("Specify target code type")
 
                 #Preprocessing & Validation Checks
                 if "columns" in file:
@@ -201,8 +195,8 @@ def main(config):
                         meta_columns=meta_columns,
                         file_path=file_path,
                         target_code_type=target_code_type,
-                        no_verify=config["no_verify"],
-                        no_translate=config["no_translate"])
+                        no_verify=no_verify,
+                        no_translate=no_translate)
                 else:
                     raise Exception("No column format provided")
 
@@ -223,7 +217,7 @@ def main(config):
                         out,
                         concepts=file["meldb_phenotypes"],
                         meta_columns=meta_columns,
-                        no_translate=config["no_translate"])
+                        no_translate=no_translate)
                 elif ("meldb_phenotypes_categories" in file) and isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
                     meta_columns.remove(divide_col) #delete categorical column
                     for cat, grp in df:
@@ -247,15 +241,22 @@ def main(config):
     out = out.sort_values(by=["MELDB_concept", "code"])
 
     #Merge with Concept Types in Summary Excel File
-    if config["summary"].endswith(".xlsx"):
-        summary = pd.read_excel(config["summary"], sheet_name="CONCEPT_TRACKING", dtype=str)
+    summary_config = mapping["concepts"]
+    if "excel_sheet" in summary_config:
+        summary_df = read_table_file(summary_config["file"], excel_sheet=summary_config["excel_sheet"])
     else:
-        raise Exception("Unsupported filetype provided for summary file")
-    summary = summary[["CONCEPT NAME ", "CONCEPT TYPE"]] #select columns
-    summary = summary.loc[1:] #drop first row (labels)
-    summary = summary.rename(columns={"CONCEPT NAME ": "MELDB_concept"})
-    summary = summary.drop_duplicates() #remove duplicates
-    out = out.merge(summary, how="left", on='MELDB_concept')
+        summary_df = read_table_file(summary_config["file"])
+    summary_cols_all = [] #get all column names
+    for v in summary_config["columns"].values(): #TODO: put in separate function - get all columns in JSON file object
+        if isinstance(v, str):
+            summary_cols_all.append(v)
+        else:
+            summary_cols_all += v
+
+    summary_df = summary_df[summary_cols_all] #select all relevant columns
+    summary_df = summary_df.rename(columns={summary_config["columns"]["concept_name"]: "MELDB_concept"})
+    summary_df = summary_df.drop_duplicates() #remove duplicates
+    out = out.merge(summary_df, how="left", on='MELDB_concept')
 
     # export as CSV to /output
     print(bcolors.HEADER, "---"*5, "OUTPUT", "---"*5, bcolors.ENDC)
@@ -265,10 +266,10 @@
 
     #Remove Duplicates from Error file
     if os.path.exists(log_errors_path):
-        df_error = pd.read_csv(log_errors_path)
-        df_error = df_error.drop_duplicates()
-        df_error = df_error.sort_values(by=["SOURCE", "CODE_TYPE", "CODE"])
-        df_error.to_csv(log_errors_path, index=False)
+        error_df = pd.read_csv(log_errors_path)
+        error_df = error_df.drop_duplicates()
+        error_df = error_df.sort_values(by=["SOURCE", "CODE_TYPE", "CODE"])
+        error_df.to_csv(log_errors_path, index=False)
 
 
 if __name__ == '__main__':
@@ -280,11 +281,35 @@
     for code_type in code_types:
         parser.add_argument(code_type.arg_small, code_type.arg_long,
                             action='store_true',
                             help=code_type.arg_help)
-    parser.add_argument("map", help="Concept/Phenotype Assignment File (json)")
-    parser.add_argument("summary", help="Summary working excel document")
+    parser.add_argument("mapping_file", help="Concept/Phenotype Assignment File (json)")
     parser.add_argument("--no-translate", action='store_true', help="Do not translate code types")
     parser.add_argument("--no-verify", action='store_true', help="Do not verify codes are correct")
+    parser.add_argument("--output", type=str, help="File Location to save output csv to")
+    parser.add_argument("--error-log", type=str, help="File Location to save error log csv to")
     args = parser.parse_args()
     config = vars(args)
-    main(config)
\ No newline at end of file
+
+    #Check which code type is the target
+    specified = False
+    for k in code_types.keys():
+        if config[k]:
+            specified = True
+            target_code_type = k
+    if not specified:
+        raise Exception("Specify target code type")
+
+    #Format Arguments for python function
+    params = {}
+    #Required Params
+    params["mapping_file"] = config["mapping_file"] if "mapping_file" in config else raise_(Exception("Must specify Location of JSON Mapping File"))
+    params["target_code_type"] = target_code_type
+    #Optional Params
+    params["no_translate"] = config["no_translate"]
params["no_verify"] = config["no_verify"] + if not config["output"] == None : + params["output_path"] = config["output"] + if not config["error_log"] == None: + params["log_errors_path"] = config["error_log"] + + run_all(**params) \ No newline at end of file diff --git a/parse.py b/parse.py index abc3bc98b9a56439bad0bfb02c3331cc0f914367..1c7e1a9022fd6866722d8e80891f710b2bcba2d0 100644 --- a/parse.py +++ b/parse.py @@ -2,8 +2,6 @@ import pandas as pd import numpy as np import os -from base import output_path -from base import log_errors_path from base import log_invalid_code from base import bcolors from base import raise_