Skip to content
Snippets Groups Projects
Commit 4f78ba85 authored by Jakub Dylag's avatar Jakub Dylag
Browse files

Convertion script - allow multiple files per concept set

parent 62e98fc3
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import yaml import yaml
import json import json
from pathlib import Path from pathlib import Path
import pandas as pd import pandas as pd
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
json_file = "PHEN_assign_v3.json" json_file = "PHEN_assign_v3.json"
yaml_path = "workspace/phen/config.yml" yaml_path = "config.yml"
source_folder_path = "workspace/phen/concepts" source_folder_path = "concepts"
outs = {} outs = []
# Read the JSON file # Read the JSON file
with open(json_file, 'r', encoding='utf-8') as file: with open(json_file, 'r', encoding='utf-8') as file:
data = json.load(file) data = json.load(file)
def add_conc(outs, name, path, columns, category=None, metadata=None): def add_conc(outs, name, path, columns, category=None, actions=None, #metacol=None
):
#TODO: acmc handle empty conceptset when all QA fail
if name == "PLASMACELL": if name == "PLASMACELL":
return outs return outs
out = { out = {
"name":str(name), "name":str(name),
"file":{ "files":{
"path":str(path), "path":str(path).replace("\\", '/'),
"columns":columns, "columns":columns,
}, },
} }
if category is not None: #divide_col
out["file"]["category"]=str(category) if (category is not None) and (actions is not None):
if metadata is not None: print("divide_col", category, actions)
out["metadata"]=metadata out["files"]["category"]=str(category)
out["files"]["actions"] = {}
out["files"]["actions"]["divide_col"] = actions["divide_col"]
#split_col
elif (actions is not None):
print("split_col", actions)
out["files"]["actions"] = {}
out["files"]["actions"]["split_col"] = actions["split_col"]
out["files"]["actions"]["codes_col"] = actions["codes_col"]
# if metacol is not None:
# out["metacol"]=metacol
outs.append(out) outs.append(out)
return outs return outs
outs = []
for folder in data["codes"]: for folder in data["codes"]:
folder_path = folder["folder"] folder_path = folder["folder"]
for files in folder["files"]: for file in folder["files"]:
#TODO: actions divide_col #TODO: actions divide_col
#TODO: save metadata - has to be dict not list?
#Columns #Columns
col_out = {} col_out = {}
for k,v in files["columns"].items(): for k,v in file["columns"].items():
supported = ["read2"] supported = ["read2", "read3", "icd10", "snomed", "opcs4", "atc"]
if type(v) == str and k[:-5] in supported: if type(v) == str and k[:-5] in supported:
col_out[k[:-5]] = v col_out[k[:-5]] = v
#Metadata #Metacolumn
# if "metadata" in files["columns"]: # if "metadata" in file["columns"]:
# meta = dict(files["columns"]["metadata"]) # meta = dict(file["columns"]["metadata"])
# else: # else:
# meta = None # meta = None
#File Path #File Path
path = folder["folder"][6:]+"/"+files["file"] new_folder_path = Path(folder["folder"][6:].replace('\\','/'))
new_file_path = Path(file["file"])
path = Path(new_folder_path / new_file_path)
#Convert XLSX to CSV File
if "excel_sheet" in file.keys():
# print("Converted Excel", path)
df_xlsx = pd.read_excel(Path(source_folder_path / path), sheet_name=file["excel_sheet"])
save_path = Path(source_folder_path / path).with_suffix(".csv")
path = Path(path).with_suffix(".csv")
# df_xlsx.to_csv(save_path) #TODO: uncomment
if "actions" in files.keys(): if "actions" in file.keys():
pass
#split_col
# if
#divide_col #divide_col
# elif "concept_set_categories" in files: if "concept_set_categories" in file:
# for cat, name in files["concept_set_categories"].items(): for cat, name in file["concept_set_categories"].items():
# print(col_out) outs = add_conc(
# outs = add_conc( outs,
# outs, name = name[0],
# name = name, category = cat,
# category = cat, actions = file["actions"],
# path=path, path=path,
# columns = {"read2":"Read Code"}, #TODO: fix bodged columns = col_out, #TODO: fix bodged
# metadata = {} # metacol = meta
# ) )
elif "excel_sheet" in files.keys(): #split_col
#Convert XLSX to CSV File else:
print("Converted Excel", path) for name in file["concept_set"]: #If belongs to multiple
df_xlsx = pd.read_excel(source_folder_path+"/"+path, sheet_name=files["excel_sheet"]) outs = add_conc(
path = Path(source_folder_path+"/"+path).with_suffix(".csv") outs,
df_xlsx.to_csv(path) name=str(name),
path=path,
columns = col_out,
actions=file["actions"],
# metacol = meta
)
elif "concept_set" in file:
#Add multiple concept sets to yaml #Add multiple concept sets to yaml
for name in files["concept_set"]: #If belongs to multiple for name in file["concept_set"]: #If belongs to multiple
outs = add_conc( outs = add_conc(
outs, outs,
name=str(name), name=str(name),
path=path, path=path,
columns = col_out, columns = col_out,
metadata = {}, # metacol = meta
# metadata = meta
) )
elif "concept_set" in files: outs = pd.DataFrame(outs)
#Add multiple concept sets to yaml display(outs)
for name in files["concept_set"]: #If belongs to multiple # print(len(outs.groupby("name")), "have files, out of", len(data["concept_sets"]["concept_set"]), "defined")
outs = add_conc(
outs, final_out = []
name=str(name), for name, grp in outs.groupby("name"):
path=path, out = {}
columns = col_out, out["name"]=name
metadata = {},
# metadata = meta out["files"]=list(grp["files"] )
)
for conc in data["concept_sets"]["concept_set"]:
if conc["concept_set_name"] == name:
metadata=conc["metadata"]
break
out["metadata"]=dict(metadata)
final_out.append(out)
print(len(final_out), "in yaml")
#Add Metadata for each concept set
# for conc in data["concept_sets"]["concept_set"]: #iterate concept set definitions
# conc_name = conc["concept_set_name"]
# metadata = conc["metadata"]
# #Look for matching concept set in output
# for c in outs:
# if c["name"] == conc_name:
# c["metadata"] = dict(metadata) #append metadata
#Remove "PLASMACELL" concept set #Remove "PLASMACELL" concept set
outs = [(o) for o in outs if o["name"] != "PLASMACELL"] # outs = [(o) for o in outs if o["name"] != "PLASMACELL"]
final = { final = {
"phenotype":{ "phenotype":{
"version": "4.0.0", "version": "4.0.0",
"omop":{ "omop":{
"vocabulary_id": "MELDB_SAIL", "vocabulary_id": "MELDB_SAIL",
"vocabulary_name": "Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity", "vocabulary_name": "Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity",
"vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb", "vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb",
}, },
"map":["read2", "read3", "icd10", "snomed", "opcs4", "atc"], "map":["read2", "read3", "icd10", "snomed", "opcs4", "atc"],
"concept_sets":outs, "concept_sets":final_out,
}, },
} }
yaml.Dumper.ignore_aliases = lambda *args : True #remove unwanted pointers yaml.Dumper.ignore_aliases = lambda *args : True #remove unwanted pointers
# Convert and write to YAML # Convert and write to YAML
with open(yaml_path, 'w', encoding='utf-8') as file: with open(yaml_path, 'w', encoding='utf-8') as file:
yaml.dump(dict(final), file, default_flow_style=False, allow_unicode=True) yaml.dump(dict(final), file, default_flow_style=False, allow_unicode=True)
``` ```
%% Output
split_col {'split_col': 'coding_system', 'codes_col': 'code'}
split_col {'split_col': 'coding_system', 'codes_col': 'code'}
split_col {'split_col': 'coding_system', 'codes_col': 'code'}
split_col {'split_col': 'coding_system', 'codes_col': 'code'}
split_col {'split_col': 'coding_system', 'codes_col': 'code'}
divide_col 13 {'divide_col': 'MMCode'}
divide_col 22 {'divide_col': 'MMCode'}
divide_col 5 {'divide_col': 'MMCode'}
divide_col 33 {'divide_col': 'MMCode'}
divide_col 37 {'divide_col': 'MMCode'}
divide_col 41 {'divide_col': 'MMCode'}
divide_col 34 {'divide_col': 'MMCode'}
divide_col 12 {'divide_col': 'MMCode'}
divide_col 6 {'divide_col': 'MMCode'}
divide_col 11 {'divide_col': 'MMCode'}
divide_col 28 {'divide_col': 'MMCode'}
divide_col 3 {'divide_col': 'MMCode'}
divide_col 21 {'divide_col': 'MMCode'}
divide_col 16 {'divide_col': 'MMCode'}
divide_col 17 {'divide_col': 'MMCode'}
divide_col 36 {'divide_col': 'MMCode'}
divide_col 27 {'divide_col': 'MMCode'}
divide_col 26 {'divide_col': 'MMCode'}
divide_col 24 {'divide_col': 'MMCode'}
divide_col 2 {'divide_col': 'MMCode'}
divide_col 31 {'divide_col': 'MMCode'}
divide_col 14 {'divide_col': 'MMCode'}
divide_col 35 {'divide_col': 'MMCode'}
divide_col 39 {'divide_col': 'MMCode'}
divide_col 38 {'divide_col': 'MMCode'}
divide_col 25 {'divide_col': 'MMCode'}
divide_col 23 {'divide_col': 'MMCode'}
divide_col 19 {'divide_col': 'MMCode'}
divide_col 40 {'divide_col': 'MMCode'}
165 in yaml
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment