Skip to content
Snippets Groups Projects
Commit 4929dd79 authored by Jakub Dylag's avatar Jakub Dylag
Browse files

Summary Sheet to CSV and filepath in JSON file

parent 12c11d66
No related branches found
No related tags found
No related merge requests found
import pandas as pd
import os
log_errors_path = "output/MELD_errors.csv"
output_path = "output/MELD_concepts_read.csv"
#TODO: set error file location from cmd
log_errors_path = "MELD_errors.csv"
class bcolors: #for printing coloured text
HEADER = '\033[95m'
......@@ -18,6 +18,8 @@ class bcolors: #for printing coloured text
def raise_(ex):
    """Raise *ex* from an expression context.

    `raise` is a statement in Python, so it cannot appear inside a lambda
    or a conditional expression; wrapping it in a function works around that.
    """
    raise ex
def log_invalid_code(codes, mask, code_type=None, file_path=None, cause=None):
print("ERROR WITH CODES", file_path, codes[~mask])
......
......@@ -5,8 +5,6 @@ import numpy as np
import json
import os
from base import output_path
from base import log_errors_path
from base import log_invalid_code
from base import bcolors
from base import raise_
......@@ -22,7 +20,7 @@ from parse import code_types
pd.set_option('mode.chained_assignment', None)
def read_code_file(path, excel_sheet=None):
def read_table_file(path, excel_sheet=None):
"""
Load Code List File
"""
......@@ -142,16 +140,21 @@ def map_file(df, target_code_type, out, concepts, meta_columns=[], no_translate=
out = pd.concat([out, codes])
return out
def main(config):
def run_all(mapping_file, target_code_type,
no_translate=False, no_verify=False,
log_errors_path="MELD_errors.csv",
output_path="MELD_concepts_read.csv"):
#Load Mapping File
if config["map"].endswith(".json"):
folders = json.load(open(config["map"],'rb'))
if mapping_file.endswith(".json"):
mapping = json.load(open(mapping_file,'rb'))
folders = mapping["codes"]
else:
raise Exception("Unsupported filetype provided for source file")
out = pd.DataFrame([]) #Create Output File to append to
#Iteratie JSON mapping file (OBJECT FORMAT)
#Iterate JSON mapping file (OBJECT FORMAT)
for folder in folders:
print(bcolors.HEADER, folder["description"], bcolors.ENDC)
if "files" in folder:
......@@ -161,10 +164,10 @@ def main(config):
#Load Code File
if "excel_sheet" in file:
df = read_code_file(path=file_path,
df = read_table_file(path=file_path,
excel_sheet = file["excel_sheet"])
else:
df = read_code_file(path=file_path)
df = read_table_file(path=file_path)
#Perform Structural Changes to file before preprocessing
#split column with multiple code types
......@@ -179,15 +182,6 @@ def main(config):
oh[oh == False] = np.NaN #replace 0s with None
df = pd.concat([df, oh], axis=1) #merge in new columns
#check which code type is the target
specified = False
for k in code_types.keys():
if config[k]:
specified = True
target_code_type = k
if not specified:
raise Exception("Specify target code type")
#Preprocessing & Validation Checks
if "columns" in file:
meta_columns=[] #meta columns to keep with codes
......@@ -201,8 +195,8 @@ def main(config):
meta_columns=meta_columns,
file_path=file_path,
target_code_type=target_code_type,
no_verify=config["no_verify"],
no_translate=config["no_translate"])
no_verify=no_verify,
no_translate=no_translate)
else:
raise Exception("No column format provided")
......@@ -223,7 +217,7 @@ def main(config):
out,
concepts=file["meldb_phenotypes"],
meta_columns=meta_columns,
no_translate=config["no_translate"])
no_translate=no_translate)
elif ("meldb_phenotypes_categories" in file) and isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
meta_columns.remove(divide_col) #delete categorical column
for cat, grp in df:
......@@ -247,15 +241,22 @@ def main(config):
out = out.sort_values(by=["MELDB_concept", "code"])
#Merge with Concept Types in Summary Excel File
if config["summary"].endswith(".xlsx"):
summary = pd.read_excel(config["summary"], sheet_name="CONCEPT_TRACKING", dtype=str)
summary_config = mapping["concepts"]
if "excel_sheet" in summary_config:
summary_df = read_table_file(summary_config["file"], excel_sheet=summary_config["excel_sheet"])
else:
summary_df = read_table_file(summary_config["file"])
summary_cols_all = [] #get all column names
for v in summary_config["columns"].values(): #TODO: put in separate function - get all columns in JSON file object
if type(v) == str:
summary_cols_all.append(v)
else:
raise Exception("Unsupported filetype provided for summary file")
summary = summary[["CONCEPT NAME ", "CONCEPT TYPE"]] #select columns
summary = summary.loc[1:] #drop first row (labels)
summary = summary.rename(columns={"CONCEPT NAME ": "MELDB_concept"})
summary = summary.drop_duplicates() #remove duplicates
out = out.merge(summary, how="left", on='MELDB_concept')
summary_cols_all += v
summary_df = summary_df[summary_cols_all] #select all relevant columns
summary_df = summary_df.rename(columns={summary_config["columns"]["concept_name"]: "MELDB_concept"})
summary_df = summary_df.drop_duplicates() #remove duplicates
out = out.merge(summary_df, how="left", on='MELDB_concept')
# export as CSV to /output
print(bcolors.HEADER, "---"*5, "OUTPUT", "---"*5, bcolors.ENDC)
......@@ -265,10 +266,10 @@ def main(config):
#Remove Duplicates from Error file
if os.path.exists(log_errors_path):
df_error = pd.read_csv(log_errors_path)
df_error = df_error.drop_duplicates()
df_error = df_error.sort_values(by=["SOURCE", "CODE_TYPE", "CODE"])
df_error.to_csv(log_errors_path, index=False)
error_df = pd.read_csv(log_errors_path)
error_df = error_df.drop_duplicates()
error_df = error_df.sort_values(by=["SOURCE", "CODE_TYPE", "CODE"])
error_df.to_csv(log_errors_path, index=False)
if __name__ == '__main__':
......@@ -280,11 +281,35 @@ if __name__ == '__main__':
parser.add_argument(code_type.arg_small, code_type.arg_long,
action='store_true',
help=code_type.arg_help)
parser.add_argument("map", help="Concept/Phenotype Assignment File (json)")
parser.add_argument("summary", help="Summary working excel document")
parser.add_argument("mapping_file", help="Concept/Phenotype Assignment File (json)")
parser.add_argument("--no-translate", action='store_true', help="Do not translate code types")
parser.add_argument("--no-verify", action='store_true', help="Do not verify codes are correct")
parser.add_argument("--output", type=str, help="File Location to save output csv to")
parser.add_argument("--error-log", type=str, help="File Location to save output csv to")
args = parser.parse_args()
config = vars(args)
main(config)
\ No newline at end of file
#Check which code type is the target
specified = False
for k in code_types.keys():
if config[k]:
specified = True
target_code_type = k
if not specified:
raise Exception("Specify target code type")
#Format Arguments for python function
params={}
#Required Params
params["mapping_file"] = config["mapping_file"] if "mapping_file" in config else Exception("Must specify Location of JSON Mapping File")
params["target_code_type"] = target_code_type
#Optional Params
params["no_translate"] = config["no_translate"]
params["no_verify"] = config["no_verify"]
if not config["output"] == None :
params["output_path"] = config["output"]
if not config["error_log"] == None:
params["log_errors_path"] = config["error_log"]
run_all(**params)
\ No newline at end of file
......@@ -2,8 +2,6 @@ import pandas as pd
import numpy as np
import os
from base import output_path
from base import log_errors_path
from base import log_invalid_code
from base import bcolors
from base import raise_
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment