meldb / concepts / Commits / 4f78ba85

Commit 4f78ba85, authored 3 months ago by Jakub Dylag
Conversion script - allow multiple files per concept set
Parent: 62e98fc3
No related branches, tags, or merge requests found.
Changes: 1 changed file, convert.ipynb (+272 −63, 272 additions and 63 deletions)

convert.ipynb · View file @ 4f78ba85
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -14,110 +14,319 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 37,
    "metadata": {},
-   "outputs": [],
+   "outputs": [ ... a stdout stream of "split_col"/"divide_col" log lines plus an
+    HTML and plain-text display of a 486-row × 2-column DataFrame (name, files);
+    reproduced under "%% Output" below ... ],
    "source": [ ... the conversion-script cell; its full before/after diff is
     rendered as Python under "%% Cell type:code" below ... ]
@@ -128,7 +337,7 @@
     "      \"vocabulary_reference\": \"https://www.it-innovation.soton.ac.uk/projects/meldb\",\n",
     "  },\n",
     "  \"map\":[\"read2\", \"read3\", \"icd10\", \"snomed\", \"opcs4\", \"atc\"],\n",
-    "  \"concept_sets\":outs,\n",
+    "  \"concept_sets\":final_out,\n",
     "  },\n",
     "}\n",
@@ -142,7 +351,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".venv",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -156,9 +365,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.9"
+   "version": "3.12.4"
   }
  },
 "nbformat": 4,
-"nbformat_minor": 2
+"nbformat_minor": 4
 }
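For orientation, here is a minimal sketch of the `config.yml` shape the updated cell now emits, where a single concept set can carry several files. The concept-set name, paths, and column mappings below are hypothetical placeholders, not values from `PHEN_assign_v3.json`; only the key structure (`name`, `files`, `path`, `columns`, `category`, `actions`, `metadata`) comes from the code in this diff.

``` python
import yaml

# Hypothetical example of one final_out entry after the groupby("name") step:
# one concept set listing two files, the second carrying a divide_col action.
example = {
    "phenotype": {
        "version": "4.0.0",
        # omop block omitted for brevity
        "map": ["read2", "read3", "icd10", "snomed", "opcs4", "atc"],
        "concept_sets": [
            {
                "name": "EXAMPLE_CONCEPT_SET",  # placeholder name
                "files": [
                    {
                        "path": "Some source/codes_a.csv",  # placeholder path
                        "columns": {"read2": "Read Code"},
                    },
                    {
                        "path": "Some source/codes_b.csv",  # placeholder path
                        "columns": {"icd10": "ICD10 Code"},
                        "category": "13",
                        "actions": {"divide_col": "MMCode"},
                    },
                ],
                "metadata": {},
            }
        ],
    }
}

yaml.Dumper.ignore_aliases = lambda *args: True  # same trick as the notebook
print(yaml.dump(example, default_flow_style=False, allow_unicode=True))
```

Each `files` entry keeps its own column mapping and optional actions, which is what lets one concept set span files with different layouts.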
%% Cell type:code id: tags:

``` python
import yaml
import json
from pathlib import Path
import pandas as pd
```

%% Cell type:code id: tags:

``` diff
 json_file = "PHEN_assign_v3.json"
-yaml_path = "workspace/phen/config.yml"
+yaml_path = "config.yml"
-source_folder_path = "workspace/phen/concepts"
+source_folder_path = "concepts"
-outs = {}
+outs = []

 # Read the JSON file
 with open(json_file, 'r', encoding='utf-8') as file:
     data = json.load(file)

-def add_conc(outs, name, path, columns, category=None, metadata=None):
+def add_conc(outs, name, path, columns, category=None, actions=None, #metacol=None
+):
+    #TODO: acmc handle empty conceptset when all QA fail
     if name == "PLASMACELL":
         return outs

     out = {
         "name":str(name),
-        "file":{
-            "path":str(path),
+        "files":{
+            "path":str(path).replace("\\", '/'),
             "columns":columns,
         },
     }
-    if category is not None:
-        out["file"]["category"]=str(category)
-    if metadata is not None:
-        out["metadata"]=metadata
+    #divide_col
+    if (category is not None) and (actions is not None):
+        print("divide_col", category, actions)
+        out["files"]["category"]=str(category)
+        out["files"]["actions"] = {}
+        out["files"]["actions"]["divide_col"] = actions["divide_col"]
+    #split_col
+    elif (actions is not None):
+        print("split_col", actions)
+        out["files"]["actions"] = {}
+        out["files"]["actions"]["split_col"] = actions["split_col"]
+        out["files"]["actions"]["codes_col"] = actions["codes_col"]
+
+    # if metacol is not None:
+    #     out["metacol"]=metacol

     outs.append(out)
     return outs

-outs = []
 for folder in data["codes"]:
     folder_path = folder["folder"]
-    for files in folder["files"]:
+    for file in folder["files"]:

         #TODO: actions divide_col
+        #TODO: save metadata - has to be dict not list?

         #Columns
         col_out = {}
-        for k,v in files["columns"].items():
-            supported = ["read2"]
+        for k,v in file["columns"].items():
+            supported = ["read2", "read3", "icd10", "snomed", "opcs4", "atc"]
             if type(v) == str and k[:-5] in supported:
                 col_out[k[:-5]] = v

-        #Metadata
-        # if "metadata" in files["columns"]:
-        #     meta = dict(files["columns"]["metadata"])
+        #Metacolumn
+        # if "metadata" in file["columns"]:
+        #     meta = dict(file["columns"]["metadata"])
         # else:
         #     meta = None

         #File Path
-        path = folder["folder"][6:]+"/"+files["file"]
+        new_folder_path = Path(folder["folder"][6:].replace('\\','/'))
+        new_file_path = Path(file["file"])
+        path = Path(new_folder_path / new_file_path)

-        if "actions" in files.keys():
-            pass
-            #split_col
-            # if
-            #divide_col
-            # elif "concept_set_categories" in files:
-            #     for cat, name in files["concept_set_categories"].items():
-            #         print(col_out)
-            #         outs = add_conc(
-            #             outs,
-            #             name = name,
-            #             category = cat,
-            #             path=path,
-            #             columns = {"read2":"Read Code"}, #TODO: fix bodged
-            #             metadata = {}
-            #         )
-        elif "excel_sheet" in files.keys():
-            #Convert XLSX to CSV File
-            print("Converted Excel", path)
-            df_xlsx = pd.read_excel(source_folder_path+"/"+path, sheet_name=files["excel_sheet"])
-            path = Path(source_folder_path+"/"+path).with_suffix(".csv")
-            df_xlsx.to_csv(path)
-
-            #Add multiple concept sets to yaml
-            for name in files["concept_set"]: #If belongs to multiple
-                outs = add_conc(
-                    outs,
-                    name=str(name),
-                    path=path,
-                    columns = col_out,
-                    metadata = {},
-                    # metadata = meta
-                )
+        #Convert XLSX to CSV File
+        if "excel_sheet" in file.keys():
+            # print("Converted Excel", path)
+            df_xlsx = pd.read_excel(Path(source_folder_path / path), sheet_name=file["excel_sheet"])
+            save_path = Path(source_folder_path / path).with_suffix(".csv")
+            path = Path(path).with_suffix(".csv")
+            # df_xlsx.to_csv(save_path) #TODO: uncomment
+
+        if "actions" in file.keys():
+            #divide_col
+            if "concept_set_categories" in file:
+                for cat, name in file["concept_set_categories"].items():
+                    outs = add_conc(
+                        outs,
+                        name = name[0],
+                        category = cat,
+                        actions = file["actions"],
+                        path=path,
+                        columns = col_out, #TODO: fix bodged
+                        # metacol = meta
+                    )
+            #split_col
+            else:
+                for name in file["concept_set"]: #If belongs to multiple
+                    outs = add_conc(
+                        outs,
+                        name=str(name),
+                        path=path,
+                        columns = col_out,
+                        actions=file["actions"],
+                        # metacol = meta
+                    )

-        elif "concept_set" in files:
+        elif "concept_set" in file:
             #Add multiple concept sets to yaml
-            for name in files["concept_set"]: #If belongs to multiple
+            for name in file["concept_set"]: #If belongs to multiple
                 outs = add_conc(
                     outs,
                     name=str(name),
                     path=path,
                     columns = col_out,
-                    metadata = {},
-                    # metadata = meta
+                    # metacol = meta
                 )

+outs = pd.DataFrame(outs)
+display(outs)
+# print(len(outs.groupby("name")), "have files, out of", len(data["concept_sets"]["concept_set"]), "defined")
+
+final_out = []
+for name, grp in outs.groupby("name"):
+    out = {}
+    out["name"]=name
+
+    out["files"]=list(grp["files"])
+
+    for conc in data["concept_sets"]["concept_set"]:
+        if conc["concept_set_name"] == name:
+            metadata=conc["metadata"]
+            break
+    out["metadata"]=dict(metadata)
+    final_out.append(out)
+
+print(len(final_out), "in yaml")
+
+#Add Metadata for each concept set
+# for conc in data["concept_sets"]["concept_set"]: #iterate concept set definitions
+#     conc_name = conc["concept_set_name"]
+#     metadata = conc["metadata"]
+
+#     #Look for matching concept set in output
+#     for c in outs:
+#         if c["name"] == conc_name:
+#             c["metadata"] = dict(metadata) #append metadata
+
 #Remove "PLASMACELL" concept set
-outs = [(o) for o in outs if o["name"] != "PLASMACELL"]
+# outs = [(o) for o in outs if o["name"] != "PLASMACELL"]

 final = {
     "phenotype":{
         "version": "4.0.0",
         "omop":{
             "vocabulary_id": "MELDB_SAIL",
             "vocabulary_name": "Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity",
             "vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb",
         },
         "map":["read2", "read3", "icd10", "snomed", "opcs4", "atc"],
-        "concept_sets":outs,
+        "concept_sets":final_out,
     },
 }

 yaml.Dumper.ignore_aliases = lambda *args : True #remove unwanted pointers

 # Convert and write to YAML
 with open(yaml_path, 'w', encoding='utf-8') as file:
     yaml.dump(dict(final), file, default_flow_style=False, allow_unicode=True)
```
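One subtlety in the new path handling worth noting: `source_folder_path` stays a plain string, yet `source_folder_path / path` is valid, because `pathlib.PurePath` implements `__rtruediv__` and therefore handles `str / Path`. A small sketch with a made-up file name:

``` python
from pathlib import Path

source_folder_path = "concepts"        # plain str, as in the notebook
path = Path("Some source/codes.xlsx")  # hypothetical file

# str / Path resolves via PurePath.__rtruediv__ and yields a Path,
# so .with_suffix() can be chained on the result.
full = source_folder_path / path
print(full)                      # concepts/Some source/codes.xlsx
print(full.with_suffix(".csv"))  # concepts/Some source/codes.csv
```

The extra `Path(...)` wrapper the notebook puts around such expressions is therefore redundant, but harmless.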
%% Output
split_col {'split_col': 'coding_system', 'codes_col': 'code'}
split_col {'split_col': 'coding_system', 'codes_col': 'code'}
split_col {'split_col': 'coding_system', 'codes_col': 'code'}
split_col {'split_col': 'coding_system', 'codes_col': 'code'}
split_col {'split_col': 'coding_system', 'codes_col': 'code'}
divide_col 13 {'divide_col': 'MMCode'}
divide_col 22 {'divide_col': 'MMCode'}
divide_col 5 {'divide_col': 'MMCode'}
divide_col 33 {'divide_col': 'MMCode'}
divide_col 37 {'divide_col': 'MMCode'}
divide_col 41 {'divide_col': 'MMCode'}
divide_col 34 {'divide_col': 'MMCode'}
divide_col 12 {'divide_col': 'MMCode'}
divide_col 6 {'divide_col': 'MMCode'}
divide_col 11 {'divide_col': 'MMCode'}
divide_col 28 {'divide_col': 'MMCode'}
divide_col 3 {'divide_col': 'MMCode'}
divide_col 21 {'divide_col': 'MMCode'}
divide_col 16 {'divide_col': 'MMCode'}
divide_col 17 {'divide_col': 'MMCode'}
divide_col 36 {'divide_col': 'MMCode'}
divide_col 27 {'divide_col': 'MMCode'}
divide_col 26 {'divide_col': 'MMCode'}
divide_col 24 {'divide_col': 'MMCode'}
divide_col 2 {'divide_col': 'MMCode'}
divide_col 31 {'divide_col': 'MMCode'}
divide_col 14 {'divide_col': 'MMCode'}
divide_col 35 {'divide_col': 'MMCode'}
divide_col 39 {'divide_col': 'MMCode'}
divide_col 38 {'divide_col': 'MMCode'}
divide_col 25 {'divide_col': 'MMCode'}
divide_col 23 {'divide_col': 'MMCode'}
divide_col 19 {'divide_col': 'MMCode'}
divide_col 40 {'divide_col': 'MMCode'}
                                          name                                              files
0                              ALL_MEDICATIONS  {'path': 'Medication code source/WP02_SAIL_WIL...
1    PAIN_MEDICATIONS_IF_NO_EPILEPSY_DIAGNOSIS  {'path': 'Medication code source/Pain medicati...
2                             PAIN_MEDICATIONS  {'path': 'Medication code source/Pain medicati...
3                                    ABDO_PAIN  {'path': 'ClinicalCodes.org from the Universit...
4                                   CVD_EVENTS  {'path': 'ClinicalCodes.org from the Universit...
..                                         ...                                                ...
481                             SLEEP_PROBLEMS  {'path': 'NEW BURDEN CODELISTS/Symptoms/SLEEP_...
482                                   SWEATING  {'path': 'NEW BURDEN CODELISTS/Symptoms/SWEATI...
483                                  TIREDNESS  {'path': 'NEW BURDEN CODELISTS/Symptoms/TIREDN...
484                  UNINTENTIONAL_WEIGHT_LOSS  {'path': 'NEW BURDEN CODELISTS/Symptoms/UNINTE...
485                       URINARY_INCONTINENCE  {'path': 'NEW BURDEN CODELISTS/Symptoms/URINAR...

[486 rows x 2 columns]

165 in yaml
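A rough reading of what the logged actions imply for whatever consumes `config.yml` (inferred from the field names and the code above; the commit itself does not confirm this behaviour): `divide_col` names a column whose value selects the rows belonging to each category's concept set, while `split_col` and `codes_col` name a column pair in which one column labels the coding system of the code held in the other. A sketch on a made-up codes table:

``` python
import pandas as pd

# Made-up codes table; the column names follow the logged actions above.
df = pd.DataFrame({
    "MMCode": [13, 13, 22],
    "coding_system": ["read2", "read2", "icd10"],
    "code": ["C10..", "C10E.", "E11"],
})

# divide_col reading: a concept set with category 13 keeps only matching rows.
category_13 = df[df["MMCode"] == 13]

# split_col/codes_col reading: group the codes column by coding system.
by_system = {sys: grp["code"].tolist()
             for sys, grp in df.groupby("coding_system")}
print(by_system)  # {'icd10': ['E11'], 'read2': ['C10..', 'C10E.']}
```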