Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
C
concepts-processing
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Package registry
Operate
Terraform modules
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
meldb
concepts-processing
Commits
8f8b5119
Commit
8f8b5119
authored
5 months ago
by
Jakub Dylag
Browse files
Options
Downloads
Patches
Plain Diff
move summary excel into json config
parent
28f44a19
No related branches found
No related tags found
No related merge requests found
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
main.py
+7
-16
7 additions, 16 deletions
main.py
process_codes_WP.ipynb
+610
-596
610 additions, 596 deletions
process_codes_WP.ipynb
with
617 additions
and
612 deletions
main.py
+
7
−
16
View file @
8f8b5119
...
@@ -255,23 +255,14 @@ def run_all(mapping_file, target_code_type,
...
@@ -255,23 +255,14 @@ def run_all(mapping_file, target_code_type,
out
=
out
.
drop_duplicates
(
subset
=
[
"
CONCEPT_SET
"
,
"
CONCEPT
"
])
out
=
out
.
drop_duplicates
(
subset
=
[
"
CONCEPT_SET
"
,
"
CONCEPT
"
])
out
=
out
.
sort_values
(
by
=
[
"
CONCEPT_SET
"
,
"
CONCEPT
"
])
out
=
out
.
sort_values
(
by
=
[
"
CONCEPT_SET
"
,
"
CONCEPT
"
])
#Merge with Concept Types in Summary Excel File
#Add Concept Set Defintions metadata
if
"
excel_sheet
"
in
summary_config
:
summary_df
=
pd
.
DataFrame
(
summary_config
[
"
concept_set
"
])
#transform to dataframe
summary_df
=
read_table_file
(
summary_config
[
"
file
"
],
excel_sheet
=
summary_config
[
"
excel_sheet
"
])
if
"
metadata
"
in
summary_df
.
columns
:
else
:
summary_df
=
summary_df
.
join
(
pd
.
json_normalize
(
summary_df
[
"
metadata
"
]))
#metadata to columns
summary_df
=
read_table_file
(
summary_config
[
"
file
"
])
summary_df
=
summary_df
.
drop
(
columns
=
[
"
metadata
"
])
summary_cols_all
=
[]
#get all column names
summary_df
=
summary_df
.
rename
(
columns
=
{
"
concept_set_name
"
:
"
CONCEPT_SET
"
})
for
v
in
summary_config
[
"
columns
"
].
values
():
#TODO: put in seperate function - get all columns in JSON file object
if
type
(
v
)
==
str
:
summary_cols_all
.
append
(
v
)
else
:
summary_cols_all
+=
v
output_version
=
summary_config
[
"
version
"
]
summary_df
=
summary_df
[
summary_cols_all
]
#select all relevant columns
summary_df
=
summary_df
.
rename
(
columns
=
{
summary_config
[
"
columns
"
][
"
concept_set_name
"
]:
"
CONCEPT_SET
"
})
summary_df
=
summary_df
.
drop_duplicates
()
#remove duplicates
summary_df
=
summary_df
.
drop_duplicates
()
#remove duplicates
out
=
out
.
merge
(
summary_df
,
how
=
"
left
"
,
on
=
'
CONCEPT_SET
'
)
out
=
out
.
merge
(
summary_df
,
how
=
"
left
"
,
on
=
'
CONCEPT_SET
'
)
#merge with output
# Save Output File
# Save Output File
print
(
bcolors
.
HEADER
,
"
---
"
*
5
,
"
OUTPUT
"
,
"
---
"
*
5
,
bcolors
.
ENDC
)
print
(
bcolors
.
HEADER
,
"
---
"
*
5
,
"
OUTPUT
"
,
"
---
"
*
5
,
bcolors
.
ENDC
)
...
...
This diff is collapsed.
Click to expand it.
process_codes_WP.ipynb
+
610
−
596
View file @
8f8b5119
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
"cells": [
"cells": [
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
2
,
"execution_count":
null
,
"id": "8c8f4cdf-04a5-4762-895e-6555781a1f05",
"id": "8c8f4cdf-04a5-4762-895e-6555781a1f05",
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
...
@@ -113,163 +113,18 @@
...
@@ -113,163 +113,18 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
64
,
"execution_count":
null
,
"id": "f155b635-b459-4aff-81b2-e065fc223858",
"id": "f155b635-b459-4aff-81b2-e065fc223858",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"data": {
"text/plain": [
"0 False\n",
"dtype: bool"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
"source": []
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
94
,
"execution_count":
null
,
"id": "d040eda5-4028-4047-834c-7315e307e415",
"id": "d040eda5-4028-4047-834c-7315e307e415",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>icd10_code</th>\n",
" <th>icd10_alt_code</th>\n",
" <th>description</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A00</td>\n",
" <td>A00</td>\n",
" <td>Cholera</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A00.0</td>\n",
" <td>A000</td>\n",
" <td>Cholera due to Vibrio cholerae 01, biovar chol...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>A00.1</td>\n",
" <td>A001</td>\n",
" <td>Cholera due to Vibrio cholerae 01, biovar eltor</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>A00.9</td>\n",
" <td>A009</td>\n",
" <td>Cholera, unspecified</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>A01</td>\n",
" <td>A01</td>\n",
" <td>Typhoid and paratyphoid fevers</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17929</th>\n",
" <td>U84.3</td>\n",
" <td>U843</td>\n",
" <td>Resistance to tuberculostatic drug(s)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17930</th>\n",
" <td>U84.7</td>\n",
" <td>U847</td>\n",
" <td>Resistance to multiple antimicrobial drugs</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17931</th>\n",
" <td>U84.8</td>\n",
" <td>U848</td>\n",
" <td>Resistance to other specified antimicrobial drug</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17932</th>\n",
" <td>U84.9</td>\n",
" <td>U849</td>\n",
" <td>Resistance to unspecified antimicrobial drugs</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17933</th>\n",
" <td>U85</td>\n",
" <td>U85X</td>\n",
" <td>Resistance to antineoplastic drugs</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>17934 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" icd10_code icd10_alt_code \\\n",
"0 A00 A00 \n",
"1 A00.0 A000 \n",
"2 A00.1 A001 \n",
"3 A00.9 A009 \n",
"4 A01 A01 \n",
"... ... ... \n",
"17929 U84.3 U843 \n",
"17930 U84.7 U847 \n",
"17931 U84.8 U848 \n",
"17932 U84.9 U849 \n",
"17933 U85 U85X \n",
"\n",
" description \n",
"0 Cholera \n",
"1 Cholera due to Vibrio cholerae 01, biovar chol... \n",
"2 Cholera due to Vibrio cholerae 01, biovar eltor \n",
"3 Cholera, unspecified \n",
"4 Typhoid and paratyphoid fevers \n",
"... ... \n",
"17929 Resistance to tuberculostatic drug(s) \n",
"17930 Resistance to multiple antimicrobial drugs \n",
"17931 Resistance to other specified antimicrobial drug \n",
"17932 Resistance to unspecified antimicrobial drugs \n",
"17933 Resistance to antineoplastic drugs \n",
"\n",
"[17934 rows x 3 columns]"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"source": [
"df = pd.read_parquet(\"maps/processed/icd10_code.parquet\")\n",
"df = pd.read_parquet(\"maps/processed/icd10_code.parquet\")\n",
"df\n"
"df\n"
...
@@ -277,35 +132,10 @@
...
@@ -277,35 +132,10 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
113
,
"execution_count":
null
,
"id": "e0228ac9-8852-4818-b7f0-98429ca5229c",
"id": "e0228ac9-8852-4818-b7f0-98429ca5229c",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 True\n",
"1 False\n",
"dtype: bool\n",
"0 False\n",
"1 False\n",
"dtype: bool\n"
]
},
{
"data": {
"text/plain": [
"0 True\n",
"1 False\n",
"dtype: bool"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"source": [
"code = [\"A00.0\", \"*00.0\"]\n",
"code = [\"A00.0\", \"*00.0\"]\n",
"code = pd.Series(code)\n",
"code = pd.Series(code)\n",
...
@@ -332,29 +162,35 @@
...
@@ -332,29 +162,35 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
119
,
"execution_count":
null
,
"id": "
4c9b6b3f-08aa-4f61-b9b2-44a24b5d00a0
",
"id": "
85dc197b-451e-4fa9-a53b-e6770c132123
",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"source": [
"name": "stdout",
"import json\n",
"output_type": "stream",
"import os\n",
"text": [
"\n",
"ALL FILES 878 878\n",
"path_json = \"../concepts/PHEN_assign_v3.json\"\n",
"JSON CONCEPTS 436 397\n",
"\n",
"EXCEL CONCEPTS 440 397\n",
"#Load JSON Concept Definitions\n",
"1755 878\n"
"mapping = json.load(open(path_json,'rb'))\n",
"summary_config = mapping[\"concept_sets\"][\"concept_set\"]\n",
"summary_df = pd.DataFrame(summary_config) #change to dataframe\n",
"\n",
"summary_df = summary_df.join(pd.json_normalize(summary_df[\"metadata\"])) #metadata to columns\n",
"summary_df = summary_df.drop(columns=[\"metadata\"])\n",
"summary_df = summary_df.rename(columns={\"concept_set_name\":\"CONCEPT_SET\"})\n",
"summary_df = summary_df.drop_duplicates() #remove duplicates\n",
" \n",
"summary_df\n"
]
]
},
},
{
{
"name": "stderr",
"cell_type": "code",
"output_type": "stream",
"execution_count": null,
"text": [
"id": "4c9b6b3f-08aa-4f61-b9b2-44a24b5d00a0",
"/opt/conda/lib/python3.9/site-packages/openpyxl/worksheet/_reader.py:329: UserWarning: Data Validation extension is not supported and will be removed\n",
"metadata": {},
" warn(msg)\n"
"outputs": [],
]
}
],
"source": [
"source": [
"import json\n",
"import json\n",
"import os\n",
"import os\n",
...
@@ -438,111 +274,10 @@
...
@@ -438,111 +274,10 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
120
,
"execution_count":
null
,
"id": "f8e70c33-c869-46f8-953e-f6b52992cfbb",
"id": "f8e70c33-c869-46f8-953e-f6b52992cfbb",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"data": {
"text/plain": [
"'JSON MISSING'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filepath</th>\n",
" <th>json_concept</th>\n",
" <th>json_code_types</th>\n",
" <th>excel_concept</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [filepath, json_concept, json_code_types, excel_concept]\n",
"Index: []"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'EXCEL MISSING'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filepath</th>\n",
" <th>json_concept</th>\n",
" <th>json_code_types</th>\n",
" <th>excel_concept</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [filepath, json_concept, json_code_types, excel_concept]\n",
"Index: []"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"source": [
"display(\"JSON MISSING\", outs[outs[\"json_concept\"].isna() & outs[\"excel_concept\"].notna()])\n",
"display(\"JSON MISSING\", outs[outs[\"json_concept\"].isna() & outs[\"excel_concept\"].notna()])\n",
"display(\"EXCEL MISSING\", outs[outs[\"json_concept\"].notna() & outs[\"excel_concept\"].isna()])"
"display(\"EXCEL MISSING\", outs[outs[\"json_concept\"].notna() & outs[\"excel_concept\"].isna()])"
...
@@ -550,7 +285,7 @@
...
@@ -550,7 +285,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
118
,
"execution_count":
null
,
"id": "9d84465f-f064-4df2-b0e4-2dfb217aea21",
"id": "9d84465f-f064-4df2-b0e4-2dfb217aea21",
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
...
@@ -567,21 +302,10 @@
...
@@ -567,21 +302,10 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
10
,
"execution_count":
null
,
"id": "7f7fc771-e406-42c7-8a09-16a20b5298f5",
"id": "7f7fc771-e406-42c7-8a09-16a20b5298f5",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"data": {
"text/plain": [
"65307"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"source": [
"total_length = 0\n",
"total_length = 0\n",
"for file in all_files[\"filepath\"]:\n",
"for file in all_files[\"filepath\"]:\n",
...
@@ -620,6 +344,7 @@
...
@@ -620,6 +344,7 @@
"cell_type": "markdown",
"cell_type": "markdown",
"id": "357bb84c-90c2-4b5f-95c0-443191783a7f",
"id": "357bb84c-90c2-4b5f-95c0-443191783a7f",
"metadata": {
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
"tags": []
},
},
"source": [
"source": [
...
@@ -628,48 +353,10 @@
...
@@ -628,48 +353,10 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
4
,
"execution_count":
null
,
"id": "7d3f9cb7-be86-4f1f-92f6-991094eb7bb7",
"id": "7d3f9cb7-be86-4f1f-92f6-991094eb7bb7",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------- output/V2_2_2_MELD_concepts_readv2.csv ---------\n",
"MELDB missing concepts 0\n",
"Chars present: ['.' '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' 'A' 'B' 'C' 'D' 'E' 'F' 'G'\n",
" 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y'\n",
" 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q'\n",
" 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']\n",
"--------- output/V2_2_2_MELD_snomed_no_translate.csv ---------\n",
"MELDB missing concepts 0\n",
"--------- output/V2_2_2_MELD_icd10_no_translate.csv ---------\n",
"MELDB missing concepts 0\n",
"Chars present: ['0' '1' '2' '3' '4' '5' '6' '7' '8' '9' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H'\n",
" 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'T' 'W' 'X' 'Y' 'Z']\n",
"--------- output/V2_2_2_MELD_atc_no_translate.csv ---------\n",
"MELDB missing concepts 0\n",
"Chars present: ['0' '1' '2' '3' '6' 'A' 'F' 'N' 'X']\n",
"--------- output/V2_2_2_MELD_errors.csv ---------\n"
]
},
{
"data": {
"text/plain": [
"CODE_TYPE\n",
"snomed_code 1261\n",
"read2_code 464\n",
"read3_code 80\n",
"icd10_code 1\n",
"Name: count, dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"source": [
"version = \"V2_2_2\"\n",
"version = \"V2_2_2\"\n",
"output_files = [f\"output/{version}_MELD_concepts_readv2.csv\",\n",
"output_files = [f\"output/{version}_MELD_concepts_readv2.csv\",\n",
...
@@ -700,141 +387,10 @@
...
@@ -700,141 +387,10 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
209
,
"execution_count":
null
,
"id": "08e0ecc1-9271-48c3-9c5b-094800072906",
"id": "08e0ecc1-9271-48c3-9c5b-094800072906",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"## Compare Concepts V2_1_4 to V2_2_3\n",
"output/V2_1_4_MELD_concepts_readv2.csv output/V2_2_3_MELD_concepts_readv2.csv\n",
"- Removed Concepts ['THYROID_DISEASE', 'SCHIZOPHRENIA_BIPOLAR_DISORDER', 'PSIORIASIS_ECZEMA', 'HAEMATOLOGICAL_CANCERS', 'INFLAMM_ARTHROPATHIES', 'ALL_CANCER', 'STROKE_TIA', 'DIABETES', 'PMR_AND_GCD', 'LONG_TERM_MS_PROBLEMS', 'ALL_CKD', 'INFLAMM_ARTHROPATHIES_CONNECTIVE_TISSUE_DIS', 'RENAL_TRANSPLANT_DIALYSIS']\n",
"- Added Concepts []\n",
"- Changed Concepts \n",
"\t - ANXIETY -7.0\n",
"\t - ARRHYTHMIA -1.0\n",
"\t - ASTHMA -1.0\n",
"\t - AUTISM_AND_ADHD -4.0\n",
"\t - BIPOLAR_DISORDER -1.0\n",
"\t - BLINDNESS_AND_LOW_VISION -3.0\n",
"\t - COELIAC_DISEASE -1.0\n",
"\t - CORONARY_HEART_DISEASE -8.0\n",
"\t - DEAFNESS -33.0\n",
"\t - DEMENTIA_ALZHEIMER -2.0\n",
"\t - DEPRESSION -5.0\n",
"\t - DIABETES_T1 -1.0\n",
"\t - DIABETES_T2 -1.0\n",
"\t - DIALYSIS -14.0\n",
"\t - DIVERTICULAR_DISEASE -11.0\n",
"\t - DRUG_ALCOHOL_MISUSE -3.0\n",
"\t - EATING_DISORDERS -2.0\n",
"\t - EPILEPSY -1.0\n",
"\t - FATIGUE -27.0\n",
"\t - HEADACHE -48.0\n",
"\t - HF -3.0\n",
"\t - INCONTINENCE -21.0\n",
"\t - LEARNING_DISABILITY -3.0\n",
"\t - MSK_PAIN -36.0\n",
"\t - MULTIPLE_SCLEROSIS -1.0\n",
"\t - PALLIATIVE_CARE -8.0\n",
"\t - PLASMACELL -1.0\n",
"\t - PTSD -1.0\n",
"\t - SCHIZOPHRENIA -1.0\n",
"\t - SELF_HARM -37.0\n",
"\t - SLEEP_PROBLEMS -74.0\n",
"\t - STRESS -31.0\n",
"\t - SYSTEMIC_LUPUS_ERYTHEMATOSUS -2.0\n",
"\n",
"output/V2_1_4_MELD_snomed_no_translate.csv output/V2_2_3_MELD_snomed_no_translate.csv\n",
"- Removed Concepts ['THYROID_DISEASE', 'SCHIZOPHRENIA_BIPOLAR_DISORDER', 'PSIORIASIS_ECZEMA', 'HAEMATOLOGICAL_CANCERS', 'INFLAMM_ARTHROPATHIES', 'ALL_CANCER', 'STROKE_TIA', 'DIABETES', 'PMR_AND_GCD', 'LONG_TERM_MS_PROBLEMS', 'ALL_CKD', 'INFLAMM_ARTHROPATHIES_CONNECTIVE_TISSUE_DIS', 'RENAL_TRANSPLANT_DIALYSIS']\n",
"- Added Concepts []\n",
"- Changed Concepts \n",
"\t - ANAEMIA -2.0\n",
"\t - ANEURYSM -3.0\n",
"\t - ANXIETY -7.0\n",
"\t - ARRHYTHMIA -25.0\n",
"\t - ASTHMA -34.0\n",
"\t - ATOPIC_ECZEMA -6.0\n",
"\t - AUTISM_AND_ADHD -2.0\n",
"\t - BIPOLAR_DISORDER -3.0\n",
"\t - BLINDNESS_AND_LOW_VISION -4.0\n",
"\t - BREAST_CANCER -2.0\n",
"\t - BRONCHIECSTASIS -1.0\n",
"\t - CHRONIC_BACK_PAIN -1.0\n",
"\t - CHRONIC_FATIGUE_SYNDROME -3.0\n",
"\t - CHRONIC_LIVER_DISEASE -14.0\n",
"\t - CHRONIC_PAIN -2.0\n",
"\t - CKD_STAGE3_5 -3.0\n",
"\t - COELIAC_DISEASE -6.0\n",
"\t - COLON_CANCER -6.0\n",
"\t - CONGENITAL_DIS_CHROMOSOMAL_ABNORMALITIES -1.0\n",
"\t - COPD -31.0\n",
"\t - CORONARY_HEART_DISEASE -21.0\n",
"\t - CYSTIC_FIBROSIS -24.0\n",
"\t - DEAFNESS -15.0\n",
"\t - DEMENTIA_ALZHEIMER -111.0\n",
"\t - DEPRESSION -34.0\n",
"\t - DIABETES_T2 -2.0\n",
"\t - DIABETIC_RETINOPATHY -13.0\n",
"\t - DIALYSIS -1.0\n",
"\t - DIVERTICULAR_DISEASE -4.0\n",
"\t - DRUG_ALCOHOL_MISUSE -310.0\n",
"\t - EATING_DISORDERS -4.0\n",
"\t - ENDOMETRIOSIS -1.0\n",
"\t - EPILEPSY -11.0\n",
"\t - GLAUCOMA -3.0\n",
"\t - GOUT -4.0\n",
"\t - HEART_VALVE_DISORDERS -6.0\n",
"\t - HF -4.0\n",
"\t - HIVAIDS -18.0\n",
"\t - HYPERTENSION -11.0\n",
"\t - HYPERTHYROIDISM -1.0\n",
"\t - HYPOTHYROIDISM -8.0\n",
"\t - IBD -2.0\n",
"\t - ILD -2.0\n",
"\t - LEARNING_DISABILITY -40.0\n",
"\t - LEUKAEMIA -1.0\n",
"\t - LYMPHOMA -2.0\n",
"\t - MENIERES_DISEASE -1.0\n",
"\t - METASTATIC_CANCER -3.0\n",
"\t - MOBILITY_PROBLEMS -45.0\n",
"\t - MULTIPLE_SCLEROSIS -13.0\n",
"\t - OBESITY -63.0\n",
"\t - OSTEOARTHRITIS -3.0\n",
"\t - OSTEOPOROSIS -4.0\n",
"\t - PARALYSIS -3.0\n",
"\t - PARKINSONS -2.0\n",
"\t - PLASMACELL -1.0\n",
"\t - PROSTATE_CANCER -2.0\n",
"\t - PROSTATE_DISORDERS -2.0\n",
"\t - PSORIASIS -3.0\n",
"\t - PTSD -38.0\n",
"\t - RENAL_TRANSPLANT -1.0\n",
"\t - RHEUMATOID_ARTHRITIS -8.0\n",
"\t - SCHIZOPHRENIA -85.0\n",
"\t - SKIN_CANCER -4.0\n",
"\t - STROKE -4.0\n",
"\t - SYSTEMIC_LUPUS_ERYTHEMATOSUS -1.0\n",
"\t - TIA -1.0\n",
"\t - VIRAL_HEPATITIS -9.0\n",
"\t - VTD -5.0\n",
"\n",
"output/V2_1_4_MELD_icd10_no_translate.csv output/V2_2_3_MELD_icd10_no_translate.csv\n",
"- Removed Concepts ['THYROID_DISEASE', 'SCHIZOPHRENIA_BIPOLAR_DISORDER', 'PSIORIASIS_ECZEMA', 'HAEMATOLOGICAL_CANCERS', 'INFLAMM_ARTHROPATHIES', 'ALL_CANCER', 'STROKE_TIA', 'DIABETES', 'PMR_AND_GCD', 'LONG_TERM_MS_PROBLEMS', 'ALL_CKD', 'INFLAMM_ARTHROPATHIES_CONNECTIVE_TISSUE_DIS']\n",
"- Added Concepts []\n",
"- Changed Concepts \n",
"\t - CVD_EVENTS -1.0\n",
"\n",
"output/V2_1_4_MELD_atc_no_translate.csv output/V2_2_3_MELD_atc_no_translate.csv\n",
"- Removed Concepts []\n",
"- Added Concepts []\n",
"- Changed Concepts \n",
"\n"
]
}
],
"source": [
"source": [
"def get_output_files(version):\n",
"def get_output_files(version):\n",
" output_files = [f\"output/{version}_MELD_concepts_readv2.csv\",\n",
" output_files = [f\"output/{version}_MELD_concepts_readv2.csv\",\n",
...
@@ -884,6 +440,33 @@
...
@@ -884,6 +440,33 @@
" "
" "
]
]
},
},
{
"cell_type": "code",
"execution_count": null,
"id": "cc60c137-5a85-4155-af6b-6796f8c05980",
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"import os\n",
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/PHEN_summary_working.csv\")\n",
"df = df.set_index(\"#\")\n",
"\n",
"for vocab in [\"atc\", \"icd10\", \"readv2\", \"snomed\"]:\n",
" df[vocab.upper()] = \"\"\n",
"\n",
" for file in glob.glob(f\"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/{vocab}/*.csv\"):\n",
" concept_set = os.path.basename(file)[:-4]\n",
" row_index = df[df[\"CONCEPT NAME \"] == concept_set].index[0]\n",
"\n",
" df.loc[row_index, vocab.upper()] = \"YES\"\n",
"\n",
"df = df.drop(columns=[\"READv2_CODE\", \"ICD10_CODE\"])\n",
"df.to_csv(\"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/PHEN_summary_working_labelled.csv\")"
]
},
{
{
"cell_type": "markdown",
"cell_type": "markdown",
"id": "e5c4291f-847b-4c82-976e-bd5b3a7b6bcc",
"id": "e5c4291f-847b-4c82-976e-bd5b3a7b6bcc",
...
@@ -1095,7 +678,7 @@
...
@@ -1095,7 +678,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
170
,
"execution_count":
null
,
"id": "a968ffb1-4337-456b-8d20-419888b4044f",
"id": "a968ffb1-4337-456b-8d20-419888b4044f",
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
...
@@ -1112,7 +695,7 @@
...
@@ -1112,7 +695,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
184
,
"execution_count":
null
,
"id": "c70b1ce2-0f41-4d02-ad17-6fc44bc3c6bf",
"id": "c70b1ce2-0f41-4d02-ad17-6fc44bc3c6bf",
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
...
@@ -1125,127 +708,558 @@
...
@@ -1125,127 +708,558 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
194
,
"execution_count":
null
,
"id": "d5d34237-02d4-4dea-8c20-5adaf337f6b5",
"id": "d5d34237-02d4-4dea-8c20-5adaf337f6b5",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
"source": [
"df1.merge(df2, how='inner', on='snomed_code')"
]
},
{
{
"data": {
"cell_type": "code",
"text/html": [
"execution_count": null,
"<div>\n",
"id": "b3166cf0-e4a5-43e0-aeac-78827427422e",
"<style scoped>\n",
"metadata": {},
" .dataframe tbody tr th:only-of-type {\n",
"outputs": [],
" vertical-align: middle;\n",
"source": [
" }\n",
".astype(str).dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0a766f9-7959-4a10-b58f-cd946a878b60",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"../concepts/PHEN_summary_working.csv\")\n",
"cols = list(df.columns)\n",
"cols.remove('CONCEPT NAME ')\n",
"cols.remove('AGREED')\n",
"df = df.applymap(lambda x: str(x) if isinstance(x, (int, float)) else x) #change to int\n",
"\n",
"df_copy = df.rename(columns={\n",
" \"CONCEPT NAME \":\"concept_set_name\",\n",
" \"AGREED\":\"concept_set_status\"\n",
"})\n",
"df_copy[\"concept_set_status\"] = df_copy[\"concept_set_status\"].replace(\"USE\", \"AGREED\")\n",
"df_copy = df_copy[[\"concept_set_name\", \"concept_set_status\"]]\n",
"outs = df_copy.to_dict(orient='records')\n",
"\n",
"\n",
" .dataframe tbody tr th {\n",
"for i, out in enumerate(outs):\n",
" vertical-align: top;\n",
" out[\"metadata\"] = dict(df[cols].iloc[i])\n",
" }\n",
" \n",
" \n",
" .dataframe thead th {\n",
"json.dumps(outs)\n"
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>bnf_code</th>\n",
" <th>snomed_code</th>\n",
" <th>read2_code</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [bnf_code, snomed_code, read2_code]\n",
"Index: []"
]
]
},
},
"execution_count": 194,
{
"cell_type": "code",
"execution_count": null,
"id": "8a204a95-dc4c-4183-9ea7-f5c5e95e9087",
"metadata": {},
"metadata": {},
"output_type": "execute_result"
"outputs": [],
}
"source": []
],
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ce1ab58-50b4-4c22-b72b-c698de6830f7",
"metadata": {},
"outputs": [],
"source": [
"source": [
"
df1.merge(df2, how='inner', on='snomed_code')
"
"
import json
"
]
]
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
180
,
"execution_count":
null
,
"id": "
d0cbadfe-ef55-40a8-a0f1-a9fc69d7456b
",
"id": "
f1ea81c6-d1db-408f-9d3a-b96f44efe21f
",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
"source": []
},
{
{
"data": {
"cell_type": "markdown",
"text/html": [
"id": "5eb544a3-9dd1-41e8-88c2-a808646c6112",
"<div>\n",
"metadata": {
"<style scoped>\n",
"jp-MarkdownHeadingCollapsed": true,
" .dataframe tbody tr th:only-of-type {\n",
"tags": []
" vertical-align: middle;\n",
},
" }\n",
"source": [
"### OMOP Database"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c9e58e62-9e44-4d0c-9d8d-35c175c07e6c",
"metadata": {},
"outputs": [],
"source": [
"import sqlite3\n",
"import csv\n",
"import pandas as pd\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f67c9a1-373f-4799-8a85-72767662d912",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d0ecdf69-ee90-42c1-ad25-d8357b603d1b",
"metadata": {},
"outputs": [],
"source": [
"#IMPORT OMOP VOCABS\n",
"conn = sqlite3.connect(\"codes/omop_54.sqlite\") # change to 'sqlite:///your_filename.db'\n",
"folder_path = \"codes/vocabulary_download_v5_{9424944c-2b76-4127-8f05-f535e0f15e2a}_1731661390540\"\n",
"\n",
"# Check if the folder exists\n",
"if not os.path.isdir(folder_path):\n",
" raise Exception(f\"Error: The folder '{folder_path}' does not exist.\") \n",
"\n",
"# Iterate through files in the folder\n",
"for filename in os.listdir(folder_path):\n",
" if filename.endswith(\".csv\"): # Check if the file is a CSV\n",
" file_path = os.path.join(folder_path, filename)\n",
" try:\n",
" print(f\"Reading file: {file_path}\")\n",
" # Read the CSV file with the specified delimiter\n",
" df = pd.read_csv(file_path, delimiter=\"\\t\", low_memory=False)\n",
" table_name = os.path.splitext(os.path.basename(file_path))[0] #Get name of file\n",
" \n",
" #Export Table to sqlite db\n",
" df.to_sql(table_name, conn, if_exists='replace', index=False)\n",
" \n",
" except Exception as e:\n",
" raise Exception(f\"Error reading file {file_path}: {e}\")\n",
"\n",
"conn.commit()\n",
"conn.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9cafd0c-a3bd-408b-bca8-b0de2acde1cd",
"metadata": {},
"outputs": [],
"source": [
"# Create a SQL connection to our SQLite database\n",
"conn = sqlite3.connect(\"codes/omop_54.sqlite\")\n",
"cur = conn.cursor()\n",
"\n",
"#Print ALL Columns in Table\n",
"# table=\"CONCEPT_SET\"\n",
"# cur.execute(f\"PRAGMA table_info({table});\")\n",
"# print(pd.DataFrame(cur.fetchall()))\n",
"\n",
"#Print ALL TABLE NAMES\n",
"# cur.execute(\"SELECT name FROM sqlite_master WHERE type='table' AND name=? ;\", (\"VOCABULARY\",))\n",
"# print(cur.fetchone())\n",
" \n",
"cur.execute(\"SELECT vocabulary_id FROM VOCABULARY WHERE vocabulary_id=? ;\", (\"MELDB\",))\n",
"print(cur.fetchone())\n",
"\n",
" \n",
" \n",
"#Print WHOLE TABLE\n",
"# cur.execute('SELECT * FROM CONCEPT;')\n",
"# cur.execute('SELECT * FROM CONCEPT WHERE standard_concept = \"C\";')\n",
"# cur.execute('SELECT * FROM CONCEPT WHERE concept_code = \"119768002\" LIMIT 1;')\n",
"# cur.execute('SELECT * FROM CONCEPT WHERE concept_code IN (\"119768002\", \"5905001\");')\n",
"# cur.execute('SELECT DISTINCT VOCABULARY_ID FROM CONCEPT;')\n",
"# df = pd.DataFrame(cur.fetchall())\n",
"# print(list(df[0]))\n",
"# display(df)\n",
"# for row in :\n",
" # print(row)\n",
"\n",
"\n",
"\n",
"#Get Header of Table\n",
"# table=\"CONCEPT_CLASS\"\n",
"# cur.execute(f\"SELECT * FROM {table} LIMIT 3;\")\n",
"# print(cur.fetchall())\n",
"\n",
"#create meldb VOCABULARY\n",
"# meldb_version='v3.2.10'\n",
"# meldb_description = 'Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity'\n",
"# meldb_reference = 'https://www.it-innovation.soton.ac.uk/projects/meldb'\n",
"# df_test = pd.DataFrame([{\n",
"# \"vocabulary_id\": 'MELDB',\n",
"# \"vocabulary_name\": meldb_description,\n",
"# \"vocabulary_reference\": meldb_reference,\n",
"# \"vocabulary_version\": meldb_version,\n",
"# # \"vocabulary_concept_id\": 0,\n",
"# }])\n",
"# df_test.to_sql(\"VOCABULARY\", conn, if_exists='append', index=False)\n",
"\n",
"\n",
"# cur.execute(\"\"\"\n",
"# CREATE TABLE CONCEPT_SET (\n",
"# concept_set_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each concept set\n",
"# atlas_id INTEGER, -- Unique identifier generated by ATLAS\n",
"# concept_set_name TEXT, -- Optional name for the concept set\n",
"# concept_set_description TEXT, -- Optional description for the concept set\n",
"# vocabulary_id TEXT NOT NULL, -- Foreign key to VOCABULARY table\n",
"# FOREIGN KEY (vocabulary_id) REFERENCES VOCABULARY(vocabulary_id)\n",
"# );\"\"\")\n",
"# cur.execute(\"DROP TABLE CONCEPT_SET;\")\n",
"\n",
"# cur.execute(\"\"\"\n",
"# CREATE TABLE CONCEPT_SET_ITEM (\n",
"# concept_set_item_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each mapping\n",
"# concept_set_id INTEGER NOT NULL, -- Foreign key to CONCEPT_SET table\n",
"# concept_id INTEGER NOT NULL, -- Foreign key to CONCEPT table\n",
"# FOREIGN KEY (concept_set_id) REFERENCES CONCEPT_SET(concept_set_id),\n",
"# FOREIGN KEY (concept_id) REFERENCES CONCEPT(concept_id)\n",
"# );\"\"\")\n",
"# cur.execute(\"DROP TABLE CONCEPT_SET_ITEM;\")\n",
"\n",
"# Be sure to close the connection\n",
"conn.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d03b75f3-902f-42d7-b52f-dac7e79ecb11",
"metadata": {},
"outputs": [],
"source": [
"conn = sqlite3.connect(\"codes/omop_54.sqlite\") # change to 'sqlite:///your_filename.db'\n",
"cur = conn.cursor()\n",
"\n",
"\n",
" .dataframe tbody tr th {\n",
"file_path = \"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/HEART_VALVE_DISORDERS.csv\"\n",
" vertical-align: top;\n",
"df = pd.read_csv(file_path, low_memory=False)\n",
" }\n",
"df = df.set_index(\"code\")\n",
"\n",
"df.to_sql(name='test', con=conn, if_exists='replace')\n",
"\n",
"conn.commit()\n",
"conn.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d96c3511-3831-400e-ba40-0a36abcc60d3",
"metadata": {},
"outputs": [],
"source": [
"#DISPLAY SQL TABLE\n",
"table=\"CONCEPT_SET_ITEM\"\n",
"\n",
"# Create a SQL connection to our SQLite database\n",
"conn = sqlite3.connect(\"codes/omop_54.sqlite\")\n",
"cur = conn.cursor()\n",
"\n",
"#Print ALL Columns in Table\n",
"cur.execute(f\"PRAGMA table_info({table});\")\n",
"df_cols = pd.DataFrame(cur.fetchall())\n",
"print(df_cols)\n",
"df_cols = df_cols[1]\n",
"\n",
"#Print TABLE\n",
"cur.execute(f\"SELECT * FROM {table};\")\n",
"df = pd.DataFrame(cur.fetchall())\n",
"df = df.rename(columns={i:s for i, s in enumerate(df_cols)})\n",
"display(df)\n",
"\n",
"conn.close()\n",
"\n",
"\n",
"# a+s = 13364 \n",
"# a+s+i = 13591\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42d49a00-9646-4ba4-afb6-12297289b7a7",
"metadata": {},
"outputs": [],
"source": [
"def sql_row_exist(conn, table, column, value):\n",
"\t# Execute and check if a result exists\n",
"\tcur = conn.cursor()\n",
"\tquery = f\"SELECT 1 FROM {table} WHERE {column} = ? LIMIT 1;\"\n",
"\tcur.execute(query, (value,))\n",
"\texists = cur.fetchone() is not None\n",
"\t\n",
"\treturn exists"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f7b51bcd-6ee1-4023-8d36-7f419ce4120d",
"metadata": {},
"outputs": [],
"source": [
"#EXPORT MELDB CSV OUTPUT\n",
"\n",
"conn = sqlite3.connect(\"codes/omop_54.sqlite\") # change to 'sqlite:///your_filename.db'\n",
"cur = conn.cursor()\n",
"\n",
"vocab_output = \"MELDB\"\n",
"vocab_type = \"SNOMED\"\n",
"file_path = \"/home/jjd1c23/ssd/meldb/jjd1c23/phenotype/output/V3_2_10_MELD_snomed_no_translate.csv\"\n",
"# file_path = \"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/HEART_VALVE_DISORDERS.csv\"\n",
"\n",
"# Read the CSV file with the specified delimiter\n",
"out = pd.read_csv(file_path, low_memory=False)\n",
"print(df.columns)\n",
"\n",
"for concept_set_name, grp in out.groupby(\"MELDB_concept\"):\n",
" # display(concept_set_name, grp[[\"code\", \"MELDB_concept\"]])\n",
" \n",
" #Create Concept_Set\n",
" if not sql_row_exist(conn, \"CONCEPT_SET\", \"concept_set_name\", concept_set_name):\n",
" cur.execute(f\"INSERT INTO CONCEPT_SET (concept_set_name, vocabulary_id) VALUES ('{concept_set_name}', 'MELDB');\")\n",
" else:\n",
" print(\"concept_set\", concept_set_name, \"already exists\")\n",
" #TODO: ask to remove old concept_set?\n",
" \n",
" #Get Concept_set_Id\n",
" query = \"SELECT concept_set_id FROM CONCEPT_SET WHERE concept_set_name = ? AND vocabulary_id = ?;\"\n",
" cur.execute(query, (concept_set_name, vocab_output, )) \n",
" concept_set_id = cur.fetchone()[0]\n",
" \n",
" \n",
" .dataframe thead th {\n",
" #Get corresponing Concept_id (OMOP) for each Concept_code (e.g. SNOMED)\n",
" text-align: right;\n",
" concept_codes = \"'\"+\"', '\".join(list(grp[\"code\"].astype(str)))+\"'\"\n",
" query = f\"SELECT concept_id FROM CONCEPT WHERE vocabulary_id = ? AND concept_code IN ({concept_codes});\"\n",
" print(query)\n",
" cur.execute(query, (vocab_type, ))\n",
" df_out = pd.DataFrame(cur.fetchall(), columns=[\"concept_id\"])\n",
" \n",
" if not len(grp) == len(df_out):\n",
" print(\"ERROR: Some\", vocab_type, \"Codes do not exist in OMOP Database\")\n",
" \n",
" #Create Concept_set_item\n",
" df_out[\"concept_set_id\"] = concept_set_id\n",
" df_out.to_sql(\"CONCEPT_SET_ITEM\", conn, if_exists='append', index=False)\n",
" \n",
" display(df_out)\n",
" \n",
" \n",
" \n",
" # break\n",
" \n",
" \n",
"\n",
"# #Create New CONCEPT_SET\n",
"# table_name = os.path.splitext(os.path.basename(file_path))[0] #Get name of file\n",
"# cur.execute(f\"INSERT INTO CONCEPT_SET (concept_class_name) VALUES ('{table_name}');\")\n",
" \n",
" \n",
" \n",
"\n",
" \n",
" \n",
"\n",
"\n",
"conn.commit()\n",
"conn.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85007741-e34c-4112-a63c-9fb302b76958",
"metadata": {},
"outputs": [],
"source": [
"\"'\"+\"', '\".join(list(grp[\"code\"].astype(str)))+\"'\""
]
},
{
"cell_type": "markdown",
"id": "423e7c21-f3bd-439d-9dcb-c17cc2cc6854",
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"source": [
"### ATLAS"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6b45e4d-c7d2-42e7-9b4a-0e9c1c86d34b",
"metadata": {},
"outputs": [],
"source": [
"#Create ATLAS Concept Set\n",
"\n",
"def atlas_create_concept(name, description=\"\", items=[]):\n",
" data={\n",
" \"id\": 0,\n",
" \"name\": name,\n",
" \"description\": description,\n",
" \"expression\": {\n",
" \"items\":items \n",
" }\n",
" }\n",
"</style>\n",
" }\n",
"<table border=\"1\" class=\"dataframe\">\n",
"\n",
" <thead>\n",
" try:\n",
" <tr style=\"text-align: right;\">\n",
" # Sending the POST request\n",
" <th></th>\n",
" response = requests.post(url, json=data, headers=headers)\n",
" <th>bnf_code</th>\n",
"\n",
" <th>snomed_code</th>\n",
" # Check the response status\n",
" <th>read2_code</th>\n",
" if response.status_code == 200 or response.status_code == 201:\n",
" </tr>\n",
" print(\"POST request successful:\")\n",
" </thead>\n",
" print(response.json()) # Assuming the response is JSON\n",
" <tbody>\n",
" return response[\"id\"]\n",
" </tbody>\n",
" else:\n",
"</table>\n",
" print(f\"POST request failed. HTTP Status Code: {response.status_code}\")\n",
"</div>"
" print(\"Response content:\")\n",
],
" print(response.text)\n",
"text/plain": [
" return None\n",
"Empty DataFrame\n",
"\n",
"Columns: [bnf_code, snomed_code, read2_code]\n",
" except requests.exceptions.RequestException as e:\n",
"Index: []"
" print(f\"An error occurred: {e}\")\n",
"\n",
"# Heart Test 1 - 1885487\n",
"# Heart Test 2 - 1885488\n",
"# Heart Valve Disorders - 1885449\n",
"\n"
]
]
},
},
"execution_count": 180,
{
"cell_type": "code",
"execution_count": null,
"id": "45497623-1da0-4f74-b21e-da8811c89b04",
"metadata": {},
"metadata": {},
"output_type": "execute_result"
"outputs": [],
}
"source": [
],
"def get_omop_concepts(cur, codes, vocab_id): \n",
"source": []
" #Create List for SQL\n",
" mask = \"\"\n",
" for c in codes:\n",
" mask+=f'\"{c}\", '\n",
" mask = mask[:-2] #remove last comma\n",
" \n",
" #Execute SQL\n",
" cur.execute(f'SELECT * FROM CONCEPT WHERE concept_code IN ({mask}) AND VOCABULARY_ID = \"{vocab_id}\";')\n",
" df = pd.DataFrame(cur.fetchall()) #convert to pandas df\n",
" \n",
" print(\"Identified\", len(df[0]) ,\"OMOP Concepts:\", list(df[0]))\n",
" \n",
" return df\n",
" \n",
"def omop_concepts_to_atlas_json(df):\n",
" json = []\n",
" for i, row in df.iterrows():\n",
" #template for atlas api\n",
" out = { \n",
" \"concept\": {\n",
" 'CONCEPT_ID': row[0],\n",
" 'CONCEPT_NAME': row[1],\n",
" 'STANDARD_CONCEPT': 'S',\n",
" 'STANDARD_CONCEPT_CAPTION': 'Standard',\n",
" 'INVALID_REASON': 'V',\n",
" 'INVALID_REASON_CAPTION': 'Valid',\n",
" 'CONCEPT_CODE': row[6],\n",
" 'DOMAIN_ID': row[2],\n",
" 'VOCABULARY_ID': row[3],\n",
" 'CONCEPT_CLASS_ID': row[4],\n",
" 'VALID_START_DATE': int(row[7]),\n",
" 'VALID_END_DATE': int(row[8])\n",
" },\n",
" 'isExcluded': False,\n",
" 'includeDescendants': False,\n",
" 'includeMapped': False\n",
" }\n",
" json.append(out)\n",
" return json \n",
"\n",
"conn = sqlite3.connect(\"codes/omop_54.sqlite\")\n",
"cur = conn.cursor()\n",
"\n",
"vocab_id=\"SNOMED\" #SNOMED, ATC, ICD10CM, ICD9CM, Read\n",
"csv_output = \"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/ANGER.csv\"\n",
"\n",
"#Load CSV Output File\n",
"df_in = pd.read_csv(csv_output)\n",
"print(len(df_in))\n",
"\n",
"# df = get_omop_concepts(cur, [\"119768002\", \"5905001\"], \"SNOMED\")\n",
"df = get_omop_concepts(cur, list(df_in[\"code\"]), vocab_id)\n",
"json = omop_concepts_to_atlas_json(df)\n",
"# display(json)\n",
"\n",
"conn.close()"
]
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
164
,
"execution_count":
null
,
"id": "
b3166cf0-e4a5-43e0-aeac-78827427422e
",
"id": "
ea759907-c085-472a-82e2-07b6b19e2c8f
",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
"source": [
"#ATLAS GET CONCEPT SET\n",
"import requests\n",
"\n",
"def request_get(url):\n",
" try:\n",
" # Sending the GET request\n",
" response = requests.get(url)\n",
"\n",
" # Check if the response status code is 200 (OK)\n",
" if response.status_code == 200:\n",
" print(\"Response data:\")\n",
" # print(response.json()) # Assuming the response is in JSON format\n",
" return response.json()\n",
" else:\n",
" print(f\"Failed to fetch data. HTTP Status Code: {response.status_code}\")\n",
" print(\"Response content:\")\n",
" print(response.text)\n",
" return None\n",
"\n",
" except requests.exceptions.RequestException as e:\n",
" print(f\"An error occurred: {e}\")\n",
"\n",
"\n",
"#GET SET INFO\n",
"set_id = \"1885449\"\n",
"url = f\"https://atlas-demo.ohdsi.org/WebAPI/conceptset/{set_id}\"\n",
"request_get(url)"
]
},
{
{
"data": {
"cell_type": "code",
"text/plain": [
"execution_count": null,
"BNF Code object\n",
"id": "5a70e636-6051-4930-bf1b-30d093fd0552",
"SNOMED Code object\n",
"metadata": {},
"dtype: object"
"outputs": [],
"source": [
"#GET SET ITEMS (Concepts)\n",
"set_id = \"1885449\"\n",
"url = f\"https://atlas-demo.ohdsi.org/WebAPI/conceptset/{set_id}/expression/ATLASPROD\"\n",
"response = request_get(url)\n",
"display(response)"
]
]
},
},
"execution_count": 164,
{
"cell_type": "code",
"execution_count": null,
"id": "96bfcd9c-27e8-4be4-a680-7553d908790e",
"metadata": {},
"metadata": {},
"output_type": "execute_result"
"outputs": [],
}
],
"source": [
"source": [
"
.astype(str).dtypes
"
"
#ATLAS CREATE CONCEPT SET\n
"
]
]
}
}
],
],
...
...
%% Cell type:code id:8c8f4cdf-04a5-4762-895e-6555781a1f05 tags:
%% Cell type:code id:8c8f4cdf-04a5-4762-895e-6555781a1f05 tags:
```
python
```
python
import
pandas
as
pd
import
pandas
as
pd
import
numpy
as
np
import
numpy
as
np
import
json
import
json
```
```
%% Cell type:markdown id:c5786d78-7dc2-4f02-ad21-cee95e473823 tags:
%% Cell type:markdown id:c5786d78-7dc2-4f02-ad21-cee95e473823 tags:
### Ho generate JSON
### Ho generate JSON
%% Cell type:code id:0292dc90-e31a-4724-8536-d0b55533aaef tags:
%% Cell type:code id:0292dc90-e31a-4724-8536-d0b55533aaef tags:
```
python
```
python
#List v4 to json
#List v4 to json
df
=
pd
.
read_excel
(
"
PHEN_code_lists_sources_V4.xlsx
"
,
sheet_name
=
"
ho
"
,
dtype
=
str
)
df
=
pd
.
read_excel
(
"
PHEN_code_lists_sources_V4.xlsx
"
,
sheet_name
=
"
ho
"
,
dtype
=
str
)
# df = df.sort_values(by="mapped_condition")
# df = df.sort_values(by="mapped_condition")
def
json_file_template
(
file
,
cons
,
types
,
metadata
):
def
json_file_template
(
file
,
cons
,
types
,
metadata
):
concepts
=
""
concepts
=
""
for
concept
in
cons
:
for
concept
in
cons
:
concepts
+=
f
'"
{
concept
}
"
,
'
concepts
+=
f
'"
{
concept
}
"
,
'
concepts
=
concepts
[:
-
2
]
#remove last ,
concepts
=
concepts
[:
-
2
]
#remove last ,
type_str
=
""
type_str
=
""
for
k
,
v
in
types
.
items
():
for
k
,
v
in
types
.
items
():
type_str
+=
f
'"
{
k
}
"
:
"
{
v
}
"
,
'
type_str
+=
f
'"
{
k
}
"
:
"
{
v
}
"
,
'
type_str
=
type_str
[:
-
2
]
type_str
=
type_str
[:
-
2
]
meta_str
=
'"
metadata
"
:[
'
meta_str
=
'"
metadata
"
:[
'
for
v
in
metadata
:
for
v
in
metadata
:
meta_str
+=
f
'"
{
v
}
"
,
'
meta_str
+=
f
'"
{
v
}
"
,
'
meta_str
=
meta_str
[:
-
2
]
meta_str
=
meta_str
[:
-
2
]
meta_str
=
meta_str
+
"
]
"
meta_str
=
meta_str
+
"
]
"
return
'''
return
'''
{
{
\"
file
\"
:
\"
'''
+
file
+
'''"
,
\"
file
\"
:
\"
'''
+
file
+
'''"
,
\"
columns
\"
:{
\"
columns
\"
:{
'''
+
type_str
+
'''
,
'''
+
type_str
+
'''
,
'''
+
meta_str
+
'''
'''
+
meta_str
+
'''
},
},
\"
meldb_phenotypes
\"
:[
'''
+
concepts
+
'''
]
\"
meldb_phenotypes
\"
:[
'''
+
concepts
+
'''
]
},
'''
},
'''
out
=
'"
files
"
:[
'
out
=
'"
files
"
:[
'
folder
=
"
codes/GitHub_TG_repository/
"
folder
=
"
codes/GitHub_TG_repository/
"
for
file
,
grp
in
df
.
groupby
(
"
mapped_condition
"
):
for
file
,
grp
in
df
.
groupby
(
"
mapped_condition
"
):
file
=
file
.
replace
(
"
%20
"
,
"
"
)
file
=
file
.
replace
(
"
%20
"
,
"
"
)
for
ext
in
[
"
_CPRD_GOLD.csv
"
,
"
_CPRD_AURUM.csv
"
,
"
_IMRD.csv
"
]:
for
ext
in
[
"
_CPRD_GOLD.csv
"
,
"
_CPRD_AURUM.csv
"
,
"
_IMRD.csv
"
]:
path
=
file
+
"
/
"
+
file
+
ext
path
=
file
+
"
/
"
+
file
+
ext
if
os
.
path
.
isfile
(
folder
+
path
):
if
os
.
path
.
isfile
(
folder
+
path
):
out
+=
json_file_template
(
path
,
grp
[
"
meldb_condition
"
],
out
+=
json_file_template
(
path
,
grp
[
"
meldb_condition
"
],
types
=
{
types
=
{
"
read2_code
"
:
"
READ_CODE
"
,
"
read2_code
"
:
"
READ_CODE
"
,
"
snomed_code
"
:
"
SNOMED_CT_CODE
"
,
"
snomed_code
"
:
"
SNOMED_CT_CODE
"
,
# "med_code":"MEDICAL_CODE_ID",
# "med_code":"MEDICAL_CODE_ID",
},
},
metadata
=
[
"
DESCRIPTION
"
]
metadata
=
[
"
DESCRIPTION
"
]
)
)
else
:
else
:
print
(
"
NOT FILE
"
,
folder
+
path
)
print
(
"
NOT FILE
"
,
folder
+
path
)
for
ext
in
[
"
_ICD10.csv
"
]:
for
ext
in
[
"
_ICD10.csv
"
]:
path
=
file
+
"
/
"
+
file
+
ext
path
=
file
+
"
/
"
+
file
+
ext
if
os
.
path
.
isfile
(
folder
+
path
):
if
os
.
path
.
isfile
(
folder
+
path
):
out
+=
json_file_template
(
path
,
grp
[
"
meldb_condition
"
],
out
+=
json_file_template
(
path
,
grp
[
"
meldb_condition
"
],
types
=
{
types
=
{
"
icd10_code
"
:
"
READ_CODE
"
,
"
icd10_code
"
:
"
READ_CODE
"
,
"
snomed_code
"
:
"
SNOMED_CT_CODE
"
,
"
snomed_code
"
:
"
SNOMED_CT_CODE
"
,
# "icd10_code":"MEDICAL_CODE_ID",
# "icd10_code":"MEDICAL_CODE_ID",
},
},
metadata
=
[
"
DESCRIPTION
"
]
metadata
=
[
"
DESCRIPTION
"
]
)
)
else
:
else
:
print
(
"
NOT FILE
"
,
folder
+
path
)
print
(
"
NOT FILE
"
,
folder
+
path
)
# out+= json_file_template(file+"/"+file+"_CPRD_AURUM.csv", grp["meldb_condition"])
# out+= json_file_template(file+"/"+file+"_CPRD_AURUM.csv", grp["meldb_condition"])
# out+= json_file_template(file+"/"+file+"_ICD10.csv", grp["meldb_condition"])
# out+= json_file_template(file+"/"+file+"_ICD10.csv", grp["meldb_condition"])
# out+= json_file_template(file+"/"+file+"_IMRD.csv", grp["meldb_condition"])
# out+= json_file_template(file+"/"+file+"_IMRD.csv", grp["meldb_condition"])
# out += f' "{file}/{file}_CPRD_GOLD.csv":[{conds}],\n'
# out += f' "{file}/{file}_CPRD_GOLD.csv":[{conds}],\n'
# out += f' "{file}/{file}_CPRD_AURUM.csv":[{conds}],\n'
# out += f' "{file}/{file}_CPRD_AURUM.csv":[{conds}],\n'
# out += f' "{file}/{file}_ICD10.csv":[{conds}],\n'
# out += f' "{file}/{file}_ICD10.csv":[{conds}],\n'
# out += f' "{file}/{file}_IMRD.csv":[{conds}],\n'
# out += f' "{file}/{file}_IMRD.csv":[{conds}],\n'
out
=
out
[:
-
1
]
#remove last ,
out
=
out
[:
-
1
]
#remove last ,
out
+=
"
\n
]
"
out
+=
"
\n
]
"
out
=
out
.
replace
(
"
%20
"
,
"
"
)
out
=
out
.
replace
(
"
%20
"
,
"
"
)
print
(
out
)
print
(
out
)
```
```
%% Cell type:code id:f155b635-b459-4aff-81b2-e065fc223858 tags:
%% Cell type:code id:f155b635-b459-4aff-81b2-e065fc223858 tags:
```
python
```
python
``
`
``
`
%%
Output
0
False
dtype
:
bool
%%
Cell
type
:
code
id
:
d040eda5
-
4028
-
4047
-
834
c
-
7315e307
e415
tags
:
%%
Cell
type
:
code
id
:
d040eda5
-
4028
-
4047
-
834
c
-
7315e307
e415
tags
:
```
python
```
python
df = pd.read_parquet("maps/processed/icd10_code.parquet")
df = pd.read_parquet("maps/processed/icd10_code.parquet")
df
df
```
```
%% Output
icd10_code icd10_alt_code \
0 A00 A00
1 A00.0 A000
2 A00.1 A001
3 A00.9 A009
4 A01 A01
... ... ...
17929 U84.3 U843
17930 U84.7 U847
17931 U84.8 U848
17932 U84.9 U849
17933 U85 U85X
description
0 Cholera
1 Cholera due to Vibrio cholerae 01, biovar chol...
2 Cholera due to Vibrio cholerae 01, biovar eltor
3 Cholera, unspecified
4 Typhoid and paratyphoid fevers
... ...
17929 Resistance to tuberculostatic drug(s)
17930 Resistance to multiple antimicrobial drugs
17931 Resistance to other specified antimicrobial drug
17932 Resistance to unspecified antimicrobial drugs
17933 Resistance to antineoplastic drugs
[17934 rows x 3 columns]
%% Cell type:code id:e0228ac9-8852-4818-b7f0-98429ca5229c tags:
%% Cell type:code id:e0228ac9-8852-4818-b7f0-98429ca5229c tags:
```
python
```
python
code = ["A00.0", "
*
00.0"]
code = ["A00.0", "
*
00.0"]
code = pd.Series(code)
code = pd.Series(code)
print(code.isin(df["icd10_code"]))
print(code.isin(df["icd10_code"]))
print(code.isin(df["icd10_alt_code"]))
print(code.isin(df["icd10_alt_code"]))
# print( )
# print( )
~(
~(
~code.isin(df["icd10_code"])
~code.isin(df["icd10_code"])
&
&
~code.isin(df["icd10_alt_code"])
~code.isin(df["icd10_alt_code"])
)
)
```
```
%% Output
0 True
1 False
dtype: bool
0 False
1 False
dtype: bool
0 True
1 False
dtype: bool
%% Cell type:markdown id:18efcacd-45f0-4341-86cc-d8e2e584350c tags:
%% Cell type:markdown id:18efcacd-45f0-4341-86cc-d8e2e584350c tags:
### Analyse the JSON file
### Analyse the JSON file
%% Cell type:code id:85dc197b-451e-4fa9-a53b-e6770c132123 tags:
```
python
import json
import os
path_json = "../concepts/PHEN_assign_v3.json"
#Load JSON Concept Definitions
mapping = json.load(open(path_json,'rb'))
summary_config = mapping
[
"concept_sets"
][
"concept_set"
]
summary_df = pd.DataFrame(summary_config) #change to dataframe
summary_df = summary_df.join(pd.json_normalize(summary_df["metadata"])) #metadata to columns
summary_df = summary_df.drop(columns=["metadata"])
summary_df = summary_df.rename(columns={"concept_set_name":"CONCEPT_SET"})
summary_df = summary_df.drop_duplicates() #remove duplicates
summary_df
```
%% Cell type:code id:4c9b6b3f-08aa-4f61-b9b2-44a24b5d00a0 tags:
%% Cell type:code id:4c9b6b3f-08aa-4f61-b9b2-44a24b5d00a0 tags:
```
python
```
python
import json
import json
import os
import os
path_json = "PHEN_assign_v3.json"
path_json = "PHEN_assign_v3.json"
path_excel = "PHEN_summary_working.xlsx"
path_excel = "PHEN_summary_working.xlsx"
path_codes = "codes/"
path_codes = "codes/"
#Get all Files in JSON
#Get all Files in JSON
def get_json_files(path_json):
def get_json_files(path_json):
folders = json.load(open(path_json,'rb'))
folders = json.load(open(path_json,'rb'))
out = []
out = []
for folder in folders:
for folder in folders:
if "files" in folder:
if "files" in folder:
for file in folder["files"]:
for file in folder["files"]:
file_path = folder["folder"]+"/"+file["file"]
file_path = folder["folder"]+"/"+file["file"]
if "meldb_phenotypes" in file:
if "meldb_phenotypes" in file:
for concept in file["meldb_phenotypes"]:
for concept in file["meldb_phenotypes"]:
out.append({"json_concept":concept, "filepath":file_path, "json_code_types":list(file["columns"].keys())})
out.append({"json_concept":concept, "filepath":file_path, "json_code_types":list(file["columns"].keys())})
elif "meldb_phenotypes_categories" in file:
elif "meldb_phenotypes_categories" in file:
for code, concept in file["meldb_phenotypes_categories"].items():
for code, concept in file["meldb_phenotypes_categories"].items():
out.append({"json_concept":concept[0], "filepath":file_path, "json_code_types":list(file["columns"].keys())})
out.append({"json_concept":concept[0], "filepath":file_path, "json_code_types":list(file["columns"].keys())})
else:
else:
out.append({"json_concept":None, "filepath":file_path})
out.append({"json_concept":None, "filepath":file_path})
out = pd.DataFrame(out)
out = pd.DataFrame(out)
out["filepath"] = out["filepath"].astype(str)
out["filepath"] = out["filepath"].astype(str)
return out
return out
out = get_json_files(path_json)
out = get_json_files(path_json)
#Get all Files Excel Summary
#Get all Files Excel Summary
def get_excel_files(path_excel):
def get_excel_files(path_excel):
path_excel = "PHEN_summary_working.xlsx"
path_excel = "PHEN_summary_working.xlsx"
out2 = pd.read_excel(path_excel)
out2 = pd.read_excel(path_excel)
out2 = out2[["CONCEPT NAME ", "CODING LIST", "AGREED", "FUNCTION"]].loc[1:] #select relevant columns
out2 = out2[["CONCEPT NAME ", "CODING LIST", "AGREED", "FUNCTION"]].loc[1:] #select relevant columns
#Filter Concepts in use
#Filter Concepts in use
out2 = out2[out2["AGREED"] == "USE"] #remove deprecated concepts
out2 = out2[out2["AGREED"] == "USE"] #remove deprecated concepts
out2 = out2[out2["FUNCTION"] == "QUERY BY CODING LIST"] #remove deprecated concepts
out2 = out2[out2["FUNCTION"] == "QUERY BY CODING LIST"] #remove deprecated concepts
out2 = out2.drop(['AGREED', 'FUNCTION'], axis=1)
out2 = out2.drop(['AGREED', 'FUNCTION'], axis=1)
#Get filepaths
#Get filepaths
out2["CODING LIST"] = out2["CODING LIST"].str.split(",") #split by ,
out2["CODING LIST"] = out2["CODING LIST"].str.split(",") #split by ,
out2 = out2.explode("CODING LIST") #one row per file
out2 = out2.explode("CODING LIST") #one row per file
out2["CODING LIST"] = out2["CODING LIST"].str.strip()
out2["CODING LIST"] = out2["CODING LIST"].str.strip()
out2["CODING LIST"] = out2["CODING LIST"].str.replace("https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/", "")
out2["CODING LIST"] = out2["CODING LIST"].str.replace("https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/", "")
out2["CODING LIST"] = out2["CODING LIST"].str.replace("%20", " ")
out2["CODING LIST"] = out2["CODING LIST"].str.replace("%20", " ")
out2 = out2.rename(columns={"CONCEPT NAME ":"excel_concept", "CODING LIST":"filepath"})
out2 = out2.rename(columns={"CONCEPT NAME ":"excel_concept", "CODING LIST":"filepath"})
return out2
return out2
out2 = get_excel_files(path_excel)
out2 = get_excel_files(path_excel)
#Get all Files in /codes
#Get all Files in /codes
def get_code_files(path_codes):
def get_code_files(path_codes):
all_files = []
all_files = []
for root, dirs, files in os.walk(path_codes, topdown=False):
for root, dirs, files in os.walk(path_codes, topdown=False):
for name in files:
for name in files:
if ".ipynb_checkpoint" not in root: #exclude notebook checkpoints
if ".ipynb_checkpoint" not in root: #exclude notebook checkpoints
if name.endswith(".csv") or name.endswith(".xlsx") or name.endswith(".dta"): #exclude non-data files
if name.endswith(".csv") or name.endswith(".xlsx") or name.endswith(".dta"): #exclude non-data files
all_files.append(os.path.join(root, name))
all_files.append(os.path.join(root, name))
all_files = pd.DataFrame(all_files)
all_files = pd.DataFrame(all_files)
all_files = all_files.rename(columns={0:"filepath"})
all_files = all_files.rename(columns={0:"filepath"})
all_files["filepath"] = all_files["filepath"].astype(str)
all_files["filepath"] = all_files["filepath"].astype(str)
return all_files
return all_files
all_files = get_code_files(path_codes)
all_files = get_code_files(path_codes)
print("ALL FILES", len(all_files), len(all_files["filepath"].unique()))
print("ALL FILES", len(all_files), len(all_files["filepath"].unique()))
print("JSON CONCEPTS", len(out), len(out["filepath"].unique()))
print("JSON CONCEPTS", len(out), len(out["filepath"].unique()))
print("EXCEL CONCEPTS", len(out2), len(out2["filepath"].unique()))
print("EXCEL CONCEPTS", len(out2), len(out2["filepath"].unique()))
outs = pd.merge(all_files, out, how="outer", on="filepath")
outs = pd.merge(all_files, out, how="outer", on="filepath")
outs = pd.merge(outs, out2, how="outer", on="filepath")
outs = pd.merge(outs, out2, how="outer", on="filepath")
print(len(outs), len(outs["filepath"].unique()))
print(len(outs), len(outs["filepath"].unique()))
outs.to_csv("output/MELD_file_to_concept.csv", index=False)
outs.to_csv("output/MELD_file_to_concept.csv", index=False)
# display(outs[ outs["concept"].isna()])
# display(outs[ outs["concept"].isna()])
# display(out )
# display(out )
```
```
%% Output
ALL FILES 878 878
JSON CONCEPTS 436 397
EXCEL CONCEPTS 440 397
1755 878
/opt/conda/lib/python3.9/site-packages/openpyxl/worksheet/_reader.py:329: UserWarning: Data Validation extension is not supported and will be removed
warn(msg)
%% Cell type:code id:f8e70c33-c869-46f8-953e-f6b52992cfbb tags:
%% Cell type:code id:f8e70c33-c869-46f8-953e-f6b52992cfbb tags:
```
python
```
python
display("JSON MISSING", outs[outs["json_concept"].isna() & outs["excel_concept"].notna()])
display("JSON MISSING", outs[outs["json_concept"].isna() & outs["excel_concept"].notna()])
display("EXCEL MISSING", outs[outs["json_concept"].notna() & outs["excel_concept"].isna()])
display("EXCEL MISSING", outs[outs["json_concept"].notna() & outs["excel_concept"].isna()])
```
```
%% Output
%% Cell type:code id:9d84465f-f064-4df2-b0e4-2dfb217aea21 tags:
%% Cell type:code id:9d84465f-f064-4df2-b0e4-2dfb217aea21 tags:
```
python
```
python
f = open('concepts-output/MELD-report.md', 'a') as f:
f = open('concepts-output/MELD-report.md', 'a') as f:
f.write(
f.write(
"""
"""
# Report
# Report
-
One thing
-
One thing
-
Two thing
-
Two thing
-
Three thing
-
Three thing
""")
""")
```
```
%% Cell type:code id:7f7fc771-e406-42c7-8a09-16a20b5298f5 tags:
%% Cell type:code id:7f7fc771-e406-42c7-8a09-16a20b5298f5 tags:
```
python
```
python
total_length = 0
total_length = 0
for file in all_files["filepath"]:
for file in all_files["filepath"]:
if file.endswith(".csv"):
if file.endswith(".csv"):
df_file = pd.read_csv(file)
df_file = pd.read_csv(file)
total_length += len(df_file)
total_length += len(df_file)
elif file.endswith(".xlsx"):
elif file.endswith(".xlsx"):
df_file = pd.read_excel(file)
df_file = pd.read_excel(file)
total_length += len(df_file)
total_length += len(df_file)
elif file.endswith(".dta"):
elif file.endswith(".dta"):
df_file = pd.read_stata(file)
df_file = pd.read_stata(file)
total_length += len(df_file)
total_length += len(df_file)
total_length
total_length
```
```
%% Output
65307
%% Cell type:code id:08a9c565-28d6-46ee-9fa8-6fa0ee28a4d5 tags:
%% Cell type:code id:08a9c565-28d6-46ee-9fa8-6fa0ee28a4d5 tags:
```
python
```
python
# Turn source filepaths into GitLab links, then aggregate one row per concept
# with a comma-separated list of all links that contribute to it.
base_url = "https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/"
outs2 = outs.copy()
outs2["filepath"] = base_url + outs2["filepath"].str.replace(" ", "%20")
outs2 = outs2.groupby("concept")["filepath"].apply(', '.join).reset_index()
outs2 = outs2.sort_values(by=["concept"])
outs2
outs2.to_csv("output/MELD_GitLab_link_to_concept.csv", index=False)
```
```
%% Cell type:markdown id:357bb84c-90c2-4b5f-95c0-443191783a7f tags:
%% Cell type:markdown id:357bb84c-90c2-4b5f-95c0-443191783a7f tags:
### Analyse Output Files
### Analyse Output Files
%% Cell type:code id:7d3f9cb7-be86-4f1f-92f6-991094eb7bb7 tags:
%% Cell type:code id:7d3f9cb7-be86-4f1f-92f6-991094eb7bb7 tags:
```
python
```
python
# Sanity-check each generated output file for one pipeline version:
# how many codes lack a MELDB concept type, and which characters appear
# in the code column (a quick way to spot malformed codes).
version = "V2_2_2"
# med_no_translate is currently excluded from the checks.
suffixes = ["concepts_readv2", "snomed_no_translate", "icd10_no_translate", "atc_no_translate"]
output_files = [f"output/{version}_MELD_{suffix}.csv" for suffix in suffixes]
error_file = f"output/{version}_MELD_errors.csv"

for output_file in output_files:
    print("---"*3, output_file, "---"*3,)
    df = pd.read_csv(output_file)
    print("MELDB missing concepts ", len(df[df["CONCEPT TYPE"].isna()]))
    if df["code"].dtype == "object":
        # Explode each code into its character set and list the distinct chars.
        print("Chars present:", np.sort(df["code"].apply(set).explode().unique()))

print("---"*3, error_file, "---"*3,)
df = pd.read_csv(error_file)
df = df.drop_duplicates()
df["CODE_TYPE"].value_counts()
```
```
%% Output
--------- output/V2_2_2_MELD_concepts_readv2.csv ---------
MELDB missing concepts 0
Chars present: ['.' '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' 'A' 'B' 'C' 'D' 'E' 'F' 'G'
'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y'
'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q'
'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']
--------- output/V2_2_2_MELD_snomed_no_translate.csv ---------
MELDB missing concepts 0
--------- output/V2_2_2_MELD_icd10_no_translate.csv ---------
MELDB missing concepts 0
Chars present: ['0' '1' '2' '3' '4' '5' '6' '7' '8' '9' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H'
'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'T' 'W' 'X' 'Y' 'Z']
--------- output/V2_2_2_MELD_atc_no_translate.csv ---------
MELDB missing concepts 0
Chars present: ['0' '1' '2' '3' '6' 'A' 'F' 'N' 'X']
--------- output/V2_2_2_MELD_errors.csv ---------
CODE_TYPE
snomed_code 1261
read2_code 464
read3_code 80
icd10_code 1
Name: count, dtype: int64
%% Cell type:code id:08e0ecc1-9271-48c3-9c5b-094800072906 tags:
%% Cell type:code id:08e0ecc1-9271-48c3-9c5b-094800072906 tags:
```
python
```
python
def get_output_files(version):
    """Return ``(output_files, error_file)`` paths for a pipeline version.

    ``output_files`` lists the per-vocabulary output CSVs (the
    ``med_no_translate`` file is currently excluded); ``error_file`` is the
    version's error-log CSV.
    """
    suffixes = [
        "concepts_readv2",
        "snomed_no_translate",
        "icd10_no_translate",
        # "med_no_translate",  # excluded for now
        "atc_no_translate",
    ]
    output_files = [f"output/{version}_MELD_{suffix}.csv" for suffix in suffixes]
    error_file = f"output/{version}_MELD_errors.csv"
    return output_files, error_file
# Compare per-concept code counts between two pipeline versions and report
# concept sets that were removed, added, or changed size.
# version_1 = "V1_0_0"
version_1 = "V2_1_4"
version_2 = "V2_2_3"
output1, err1 = get_output_files(version_1)
output2, err2 = get_output_files(version_2)

def _concept_counts(path):
    """Number of codes per MELDB concept in one output CSV."""
    frame = pd.read_csv(path)
    return frame[["code", "MELDB_concept"]].groupby("MELDB_concept").count()

print("## Compare Concepts", version_1, "to", version_2)
for out1, out2 in zip(output1, output2):
    print(out1, out2)
    df1 = _concept_counts(out1)
    df2 = _concept_counts(out2)
    # Concept sets present in exactly one of the two versions.
    print("- Removed Concepts", list(set(df1.index) - set(df2.index)))
    print("- Added Concepts", list(set(df2.index) - set(df1.index)))
    # Concept sets whose code count changed (non-zero, non-NaN delta).
    diff = df2 - df1
    diff = diff[(~(diff["code"] == 0.0)) & diff["code"].notna()]
    s = "\n"
    for concept, row in diff.iterrows():
        s += "\t - {} {}\n".format(concept, row["code"])
    print("- Changed Concepts", s)
# for output_file in output_files:
# for output_file in output_files:
# print("---"*3,output_file,"---"*3,)
# print("---"*3,output_file,"---"*3,)
# df = pd.read_csv(output_file)
# df = pd.read_csv(output_file)
# # df["MELDB_concept"].loc[df["CONCEPT TYPE"].isna()]
# # df["MELDB_concept"].loc[df["CONCEPT TYPE"].isna()]
# print("MELDB missing concepts ", len(df[df["CONCEPT TYPE"].isna()]))
# print("MELDB missing concepts ", len(df[df["CONCEPT TYPE"].isna()]))
# if df["code"].dtype == "object":
# if df["code"].dtype == "object":
# print("Chars present:", np.sort(df["code"].apply(lambda x : set(x)).explode().unique()))
# print("Chars present:", np.sort(df["code"].apply(lambda x : set(x)).explode().unique()))
```
```
%% Output
%% Cell type:code id:cc60c137-5a85-4155-af6b-6796f8c05980 tags:
```
python
import glob
import os
import pandas as pd

# Mark, for each concept set in the PHEN summary, which vocabularies have a
# codelist file on disk (one "YES" column per vocabulary).
df = pd.read_csv("/home/jjd1c23/ssd/meldb/jjd1c23/concepts/PHEN_summary_working.csv")
df = df.set_index("#")
for vocab in ["atc", "icd10", "readv2", "snomed"]:
    df[vocab.upper()] = ""
    for file in glob.glob(f"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/{vocab}/*.csv"):
        # Filename (without .csv) is the concept-set name.
        concept_set = os.path.splitext(os.path.basename(file))[0]
        row_index = df[df["CONCEPT NAME "] == concept_set].index[0]
        df.loc[row_index, vocab.upper()] = "YES"
df = df.drop(columns=["READv2_CODE", "ICD10_CODE"])
df.to_csv("/home/jjd1c23/ssd/meldb/jjd1c23/concepts/PHEN_summary_working_labelled.csv")
- COELIAC_DISEASE -1.0
```
- CORONARY_HEART_DISEASE -8.0
- DEAFNESS -33.0
- DEMENTIA_ALZHEIMER -2.0
- DEPRESSION -5.0
- DIABETES_T1 -1.0
- DIABETES_T2 -1.0
- DIALYSIS -14.0
- DIVERTICULAR_DISEASE -11.0
- DRUG_ALCOHOL_MISUSE -3.0
- EATING_DISORDERS -2.0
- EPILEPSY -1.0
- FATIGUE -27.0
- HEADACHE -48.0
- HF -3.0
- INCONTINENCE -21.0
- LEARNING_DISABILITY -3.0
- MSK_PAIN -36.0
- MULTIPLE_SCLEROSIS -1.0
- PALLIATIVE_CARE -8.0
- PLASMACELL -1.0
- PTSD -1.0
- SCHIZOPHRENIA -1.0
- SELF_HARM -37.0
- SLEEP_PROBLEMS -74.0
- STRESS -31.0
- SYSTEMIC_LUPUS_ERYTHEMATOSUS -2.0
output/V2_1_4_MELD_snomed_no_translate.csv output/V2_2_3_MELD_snomed_no_translate.csv
- Removed Concepts ['THYROID_DISEASE', 'SCHIZOPHRENIA_BIPOLAR_DISORDER', 'PSIORIASIS_ECZEMA', 'HAEMATOLOGICAL_CANCERS', 'INFLAMM_ARTHROPATHIES', 'ALL_CANCER', 'STROKE_TIA', 'DIABETES', 'PMR_AND_GCD', 'LONG_TERM_MS_PROBLEMS', 'ALL_CKD', 'INFLAMM_ARTHROPATHIES_CONNECTIVE_TISSUE_DIS', 'RENAL_TRANSPLANT_DIALYSIS']
- Added Concepts []
- Changed Concepts
- ANAEMIA -2.0
- ANEURYSM -3.0
- ANXIETY -7.0
- ARRHYTHMIA -25.0
- ASTHMA -34.0
- ATOPIC_ECZEMA -6.0
- AUTISM_AND_ADHD -2.0
- BIPOLAR_DISORDER -3.0
- BLINDNESS_AND_LOW_VISION -4.0
- BREAST_CANCER -2.0
- BRONCHIECSTASIS -1.0
- CHRONIC_BACK_PAIN -1.0
- CHRONIC_FATIGUE_SYNDROME -3.0
- CHRONIC_LIVER_DISEASE -14.0
- CHRONIC_PAIN -2.0
- CKD_STAGE3_5 -3.0
- COELIAC_DISEASE -6.0
- COLON_CANCER -6.0
- CONGENITAL_DIS_CHROMOSOMAL_ABNORMALITIES -1.0
- COPD -31.0
- CORONARY_HEART_DISEASE -21.0
- CYSTIC_FIBROSIS -24.0
- DEAFNESS -15.0
- DEMENTIA_ALZHEIMER -111.0
- DEPRESSION -34.0
- DIABETES_T2 -2.0
- DIABETIC_RETINOPATHY -13.0
- DIALYSIS -1.0
- DIVERTICULAR_DISEASE -4.0
- DRUG_ALCOHOL_MISUSE -310.0
- EATING_DISORDERS -4.0
- ENDOMETRIOSIS -1.0
- EPILEPSY -11.0
- GLAUCOMA -3.0
- GOUT -4.0
- HEART_VALVE_DISORDERS -6.0
- HF -4.0
- HIVAIDS -18.0
- HYPERTENSION -11.0
- HYPERTHYROIDISM -1.0
- HYPOTHYROIDISM -8.0
- IBD -2.0
- ILD -2.0
- LEARNING_DISABILITY -40.0
- LEUKAEMIA -1.0
- LYMPHOMA -2.0
- MENIERES_DISEASE -1.0
- METASTATIC_CANCER -3.0
- MOBILITY_PROBLEMS -45.0
- MULTIPLE_SCLEROSIS -13.0
- OBESITY -63.0
- OSTEOARTHRITIS -3.0
- OSTEOPOROSIS -4.0
- PARALYSIS -3.0
- PARKINSONS -2.0
- PLASMACELL -1.0
- PROSTATE_CANCER -2.0
- PROSTATE_DISORDERS -2.0
- PSORIASIS -3.0
- PTSD -38.0
- RENAL_TRANSPLANT -1.0
- RHEUMATOID_ARTHRITIS -8.0
- SCHIZOPHRENIA -85.0
- SKIN_CANCER -4.0
- STROKE -4.0
- SYSTEMIC_LUPUS_ERYTHEMATOSUS -1.0
- TIA -1.0
- VIRAL_HEPATITIS -9.0
- VTD -5.0
output/V2_1_4_MELD_icd10_no_translate.csv output/V2_2_3_MELD_icd10_no_translate.csv
- Removed Concepts ['THYROID_DISEASE', 'SCHIZOPHRENIA_BIPOLAR_DISORDER', 'PSIORIASIS_ECZEMA', 'HAEMATOLOGICAL_CANCERS', 'INFLAMM_ARTHROPATHIES', 'ALL_CANCER', 'STROKE_TIA', 'DIABETES', 'PMR_AND_GCD', 'LONG_TERM_MS_PROBLEMS', 'ALL_CKD', 'INFLAMM_ARTHROPATHIES_CONNECTIVE_TISSUE_DIS']
- Added Concepts []
- Changed Concepts
- CVD_EVENTS -1.0
output/V2_1_4_MELD_atc_no_translate.csv output/V2_2_3_MELD_atc_no_translate.csv
- Removed Concepts []
- Added Concepts []
- Changed Concepts
%% Cell type:markdown id:e5c4291f-847b-4c82-976e-bd5b3a7b6bcc tags:
%% Cell type:markdown id:e5c4291f-847b-4c82-976e-bd5b3a7b6bcc tags:
### Mappings
### Mappings
%% Cell type:code id:08e34750-413c-469e-bcb8-e71bb188ff42 tags:
%% Cell type:code id:08e34750-413c-469e-bcb8-e71bb188ff42 tags:
```
python
```
python
# NHS Read Browser: export Read v2/v3 code lists and their cross-maps
# (ATC, ICD-10, OPCS-4) from the distributed DBF tables to parquet.
import simpledbf
import pandas as pd

V2_DIR = 'maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2'
V3_DIR = 'maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V3'

def _drop_dash(frame, column):
    # Codes containing '-' denote ranges/placeholders in these tables; drop them.
    return frame[~frame[column].str.match("^.*-.*$")]

# r2 only: union of ancestor and descendant Read v2 codes.
df = simpledbf.Dbf5(V2_DIR + '/ANCESTOR.DBF').to_dataframe()
df = pd.DataFrame(pd.concat([df['READCODE'], df['DESCENDANT']]).drop_duplicates())
df = df.rename(columns={0: "read2_code"})
df.to_parquet("maps/processed/read2_code.parquet", index=False)

# r2 -> atc
df = simpledbf.Dbf5(V2_DIR + '/ATC.DBF').to_dataframe()
df = df[["READCODE", "ATC"]].rename(columns={"READCODE": "read2_code", "ATC": "atc_code"})
df.to_parquet("maps/processed/read2_code_to_atc_code.parquet", index=False)

# r2 -> icd10
df = simpledbf.Dbf5(V2_DIR + '/ICD10.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]].rename(columns={"READ_CODE": "read2_code", "TARG_CODE": "icd10_code"})
df = _drop_dash(df, "icd10_code")
df = _drop_dash(df, "read2_code")
df.to_parquet("maps/processed/read2_code_to_icd10_code.parquet", index=False)

# r2 -> opcs4
df = simpledbf.Dbf5(V2_DIR + '/OPCS4V3.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]].rename(columns={"READ_CODE": "read2_code", "TARG_CODE": "opcs4_code"})
df = _drop_dash(df, "opcs4_code")
df = _drop_dash(df, "read2_code")
df.to_parquet("maps/processed/read2_code_to_opcs4_code.parquet", index=False)

# r3 only: union of ancestor and descendant Read v3 (CTV3) codes.
df = simpledbf.Dbf5(V3_DIR + '/ANCESTOR.DBF').to_dataframe()
df = pd.DataFrame(pd.concat([df['READCODE'], df['DESCENDANT']]).drop_duplicates())
df = df.rename(columns={0: "read3_code"})
df.to_parquet("maps/processed/read3_code.parquet", index=False)

# r3 -> icd10
df = simpledbf.Dbf5(V3_DIR + '/ICD10.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]].rename(columns={"READ_CODE": "read3_code", "TARG_CODE": "icd10_code"})
df = _drop_dash(df, "icd10_code")
df = _drop_dash(df, "read3_code")
df.to_parquet("maps/processed/read3_code_to_icd10_code.parquet", index=False)

# r3 -> icd9 (not processed)
# dbf = simpledbf.Dbf5(V3_DIR + '/ICD9V3.DBF')

# r3 -> opcs4
df = simpledbf.Dbf5(V3_DIR + '/OPCS4V3.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]].rename(columns={"READ_CODE": "read3_code", "TARG_CODE": "opcs4_code"})
df = _drop_dash(df, "opcs4_code")
df = _drop_dash(df, "read3_code")
df.to_parquet("maps/processed/read3_code_to_opcs4_code.parquet", index=False)
```
```
%% Cell type:code id:5fe95638-1f25-45f3-803c-2fff74a2a4fd tags:
%% Cell type:code id:5fe95638-1f25-45f3-803c-2fff74a2a4fd tags:
```
python
```
python
# NHS Data Migrations: export SNOMED code list and Read2/Read3/SNOMED
# cross-maps from the Clinically Assured migration tables to parquet.
ASSURED = 'maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/'

def _drop_underscore(frame, column):
    # Rows whose code contains '_' are placeholders in these tables; drop them.
    return frame[~frame[column].str.match("^.*_.*$")]

# r2 only (not processed)
# df = pd.read_csv(ASSURED + 'rctcremap_uk_20200401000001.txt', sep='\t')
# r3 only (not processed)
# df = pd.read_csv(ASSURED + 'ctv3cremap_uk_20200401000001.txt', sep='\t')

# snomed only: distinct SNOMED concept ids, stored as strings.
df = pd.read_csv(ASSURED + 'sctcremap_uk_20200401000001.txt', sep='\t')
df = df[["SCT_CONCEPTID"]].rename(columns={"SCT_CONCEPTID": "snomed_code"})
df = df.drop_duplicates()
df = df.astype(str)
df.to_parquet("maps/processed/snomed_code.parquet", index=False)

# r2 -> r3
df = pd.read_csv(ASSURED + 'rctctv3map_uk_20200401000001.txt', sep='\t')
df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]].rename(
    columns={"V2_CONCEPTID": "read2_code", "CTV3_CONCEPTID": "read3_code"})
df.to_parquet("maps/processed/read2_code_to_read3_code.parquet", index=False)

# r3 -> r2
df = pd.read_csv(ASSURED + 'ctv3rctmap_uk_20200401000002.txt', sep='\t')
df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]].rename(
    columns={"CTV3_CONCEPTID": "read3_code", "V2_CONCEPTID": "read2_code"})
df = df.drop_duplicates()
df = _drop_underscore(df, "read2_code")
df.to_parquet("maps/processed/read3_code_to_read2_code.parquet", index=False)

# r2 -> snomed
df = pd.read_csv(ASSURED + 'rcsctmap2_uk_20200401000001.txt', sep='\t', dtype=str)
df = df[["ReadCode", "ConceptId"]].rename(
    columns={"ReadCode": "read2_code", "ConceptId": "snomed_code"})
df.to_parquet("maps/processed/read2_code_to_snomed_code.parquet", index=False)

# r3 -> snomed
df = pd.read_csv(ASSURED + 'ctv3sctmap2_uk_20200401000001.txt', sep='\t')
df = df[["CTV3_TERMID", "SCT_CONCEPTID"]].rename(
    columns={"CTV3_TERMID": "read3_code", "SCT_CONCEPTID": "snomed_code"})
df["snomed_code"] = df["snomed_code"].astype(str)
df = _drop_underscore(df, "snomed_code")
df.to_parquet("maps/processed/read3_code_to_snomed_code.parquet", index=False)
```
```
%% Cell type:code id:267fa1cc-5159-48c4-9eee-19af5039d627 tags:
%% Cell type:code id:267fa1cc-5159-48c4-9eee-19af5039d627 tags:
```
python
```
python
# OPCS-4.10 code list: tab-separated code/title pairs with no header row.
df = pd.read_csv(
    "maps/OPCS410 Data files txt/OPCS410 CodesAndTitles Nov 2022 V1.0.txt",
    sep='\t', dtype=str, header=None,
).rename(columns={0: "opcs4_code", 1: "description"})
df.to_parquet("maps/processed/opcs4_code.parquet", index=False)
```
```
%% Cell type:code id:01d046fd-69af-44f3-acad-5d0edef3f745 tags:
%% Cell type:code id:01d046fd-69af-44f3-acad-5d0edef3f745 tags:
```
python
```
python
# ICD-10 Edition 5: extract code, alternate code and description from the XML.
df = pd.read_xml("maps/ICD10_Edition5_XML_20160401/Content/ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml",)
df = df[["CODE", "ALT_CODE", "DESCRIPTION"]].rename(columns={
    "CODE": "icd10_code",
    "ALT_CODE": "icd10_alt_code",
    "DESCRIPTION": "description",
})
df.to_parquet("maps/processed/icd10_code.parquet", index=False)
```
```
%% Cell type:code id:36630e24-f56c-48e1-8ecf-4ccd2b41eaea tags:
%% Cell type:code id:36630e24-f56c-48e1-8ecf-4ccd2b41eaea tags:
```
python
```
python
# Quick demo: translate the first few source codes via a processed map file.
code1 = "read2_code"
code2 = "icd10_code"
df_map = pd.read_parquet(f"maps/processed/{code1}_to_{code2}.parquet")
codes = df_map[code1].iloc[:5]
pd.merge(codes, df_map, how='left')[code2]
```
```
%% Cell type:code id:9787adeb-8507-488b-9a91-b8df3fbbe21e tags:
%% Cell type:code id:9787adeb-8507-488b-9a91-b8df3fbbe21e tags:
```
python
```
python
# CPRD Aurum medical dictionary: med code plus its Read2/SNOMED equivalents.
df = pd.read_csv('maps/CPRD_CodeBrowser_202211_Aurum/CPRDAurumMedical.txt', sep='\t')
df = df[["MedCodeId", "CleansedReadCode", "SnomedCTConceptId"]].rename(columns={
    "MedCodeId": "med_code",
    "CleansedReadCode": "read2_code",
    "SnomedCTConceptId": "snomed_code",
})
# Other CPRD browser files (not processed here):
# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_Aurum/CPRDAurumProduct.txt', sep='\t', dtype=str)
# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_GOLD/medical.txt', sep='\t')
# df = df.reset_index().iloc[:,[1,6]]
# df = df.rename(columns={"level_1":"read2_code", "20220523":"description"})
# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_GOLD/product.txt', sep='\t', dtype=str) #CANNOT OPEN
df
```
```
%% Cell type:code id:a968ffb1-4337-456b-8d20-419888b4044f tags:
%% Cell type:code id:a968ffb1-4337-456b-8d20-419888b4044f tags:
```
python
```
python
# BNF -> SNOMED mapping spreadsheet, exported as an all-string parquet map.
df = pd.read_excel("maps/BNF Snomed Mapping data 20231215.xlsx")
df = df.astype(str).rename(columns={
    "BNF Code": "bnf_code",
    "SNOMED Code": "snomed_code",
})
df[["bnf_code", "snomed_code"]].to_parquet("maps/processed/bnf_code_to_snomed_code.parquet", index=False)
```
```
%% Cell type:code id:c70b1ce2-0f41-4d02-ad17-6fc44bc3c6bf tags:
%% Cell type:code id:c70b1ce2-0f41-4d02-ad17-6fc44bc3c6bf tags:
```
python
```
python
#BNF to Readv2 Merge
#BNF to Readv2 Merge
df1 = pd.read_parquet("maps/processed/bnf_code_to_snomed_code.parquet").astype(str)
df1 = pd.read_parquet("maps/processed/bnf_code_to_snomed_code.parquet").astype(str)
df2 = pd.read_parquet("maps/processed/read2_code_to_snomed_code.parquet").astype(str)
df2 = pd.read_parquet("maps/processed/read2_code_to_snomed_code.parquet").astype(str)
# df1.merge(df2, how="inner", on="snomed_code")
# df1.merge(df2, how="inner", on="snomed_code")
```
```
%% Cell type:code id:d5d34237-02d4-4dea-8c20-5adaf337f6b5 tags:
%% Cell type:code id:d5d34237-02d4-4dea-8c20-5adaf337f6b5 tags:
```
python
```
python
# Inner-join BNF->SNOMED with Read2->SNOMED on the shared snomed_code column.
merged = df1.merge(df2, how='inner', on='snomed_code')
merged
```
```
%%
Output
%%
Cell type:code id:b3166cf0-e4a5-43e0-aeac-78827427422e tags:
Empty DataFrame
```
python
Columns: [bnf_code, snomed_code, read2_code]
.astype(str).dtypes
Index: []
```
%% Cell type:code id:
d0cbadfe-ef55-40a8-a0f1-a9fc69d7456b
tags:
%% Cell type:code id:
c0a766f9-7959-4a10-b58f-cd946a878b60
tags:
```
python
```
python
import json  # FIX: `json` was only imported in a later cell, so running this
             # cell in order raised NameError on the final line.

# Convert the PHEN summary spreadsheet into concept-set records, each carrying
# the remaining columns as a metadata dict, then serialize to JSON.
df = pd.read_csv("../concepts/PHEN_summary_working.csv")
cols = list(df.columns)
cols.remove('CONCEPT NAME ')
cols.remove('AGREED')
# Stringify numeric cells so they serialize cleanly (the old comment said
# "change to int" but the code has always converted to str).
df = df.applymap(lambda x: str(x) if isinstance(x, (int, float)) else x)
df_copy = df.rename(columns={
    "CONCEPT NAME ": "concept_set_name",
    "AGREED": "concept_set_status",
})
df_copy["concept_set_status"] = df_copy["concept_set_status"].replace("USE", "AGREED")
df_copy = df_copy[["concept_set_name", "concept_set_status"]]
outs = df_copy.to_dict(orient='records')
for i, out in enumerate(outs):
    # Attach every non-name/status column as per-record metadata.
    out["metadata"] = dict(df[cols].iloc[i])
json.dumps(outs)
```
```
%%
Output
%%
Cell type:code id:8a204a95-dc4c-4183-9ea7-f5c5e95e9087 tags:
Empty DataFrame
```
python
Columns: [bnf_code, snomed_code, read2_code]
```
Index: []
%% Cell type:code id:
b3166cf0-e4a5-43e0-aeac-78827427422e
tags:
%% Cell type:code id:
5ce1ab58-50b4-4c22-b72b-c698de6830f7
tags:
```
python
```
python
.astype(str).dtypes
import json
```
```
%% Output
%% Cell type:code id:f1ea81c6-d1db-408f-9d3a-b96f44efe21f tags:
```
python
```
%% Cell type:markdown id:5eb544a3-9dd1-41e8-88c2-a808646c6112 tags:
### OMOP Database
%% Cell type:code id:c9e58e62-9e44-4d0c-9d8d-35c175c07e6c tags:
```
python
import sqlite3
import csv
import pandas as pd
import os
```
%% Cell type:code id:4f67c9a1-373f-4799-8a85-72767662d912 tags:
```
python
```
%% Cell type:code id:d0ecdf69-ee90-42c1-ad25-d8357b603d1b tags:
```
python
#IMPORT OMOP VOCABS
# Load every tab-separated CSV in the vocabulary download into the OMOP
# SQLite database, one table per file (table name = file name sans extension).
conn = sqlite3.connect("codes/omop_54.sqlite")  # change to 'sqlite:///your_filename.db'
folder_path = "codes/vocabulary_download_v5_{9424944c-2b76-4127-8f05-f535e0f15e2a}_1731661390540"
try:
    # Check if the folder exists
    if not os.path.isdir(folder_path):
        raise Exception(f"Error: The folder '{folder_path}' does not exist.")
    # Iterate through files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):  # Check if the file is a CSV
            file_path = os.path.join(folder_path, filename)
            try:
                print(f"Reading file: {file_path}")
                # OMOP vocabulary CSVs are tab-delimited despite the extension.
                df = pd.read_csv(file_path, delimiter="\t", low_memory=False)
                table_name = os.path.splitext(os.path.basename(file_path))[0]
                # Export table to the sqlite db, replacing any previous load.
                df.to_sql(table_name, conn, if_exists='replace', index=False)
            except Exception as e:
                # FIX: chain the original exception so the traceback is kept.
                raise Exception(f"Error reading file {file_path}: {e}") from e
    conn.commit()
finally:
    # FIX: the connection was leaked whenever any file failed to load.
    conn.close()
```
%% Cell type:code id:b9cafd0c-a3bd-408b-bca8-b0de2acde1cd tags:
```
python
# Create a SQL connection to the OMOP SQLite database and check whether the
# custom 'MELDB' vocabulary row exists.  The commented statements below are
# kept from exploratory work (schema inspection and the one-off DDL that
# created the MELDB vocabulary and concept-set tables).
conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()

# Inspect a table's columns / list tables:
# cur.execute(f"PRAGMA table_info(CONCEPT_SET);"); print(pd.DataFrame(cur.fetchall()))
# cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=? ;", ("VOCABULARY",)); print(cur.fetchone())

cur.execute("SELECT vocabulary_id FROM VOCABULARY WHERE vocabulary_id=? ;", ("MELDB",))
print(cur.fetchone())

# Ad-hoc CONCEPT queries used during exploration:
# cur.execute('SELECT * FROM CONCEPT;')
# cur.execute('SELECT * FROM CONCEPT WHERE standard_concept = "C";')
# cur.execute('SELECT * FROM CONCEPT WHERE concept_code = "119768002" LIMIT 1;')
# cur.execute('SELECT * FROM CONCEPT WHERE concept_code IN ("119768002", "5905001");')
# cur.execute('SELECT DISTINCT VOCABULARY_ID FROM CONCEPT;')
# df = pd.DataFrame(cur.fetchall()); print(list(df[0])); display(df)

# One-off setup: register the MELDB vocabulary row.
# meldb_version = 'v3.2.10'
# meldb_description = 'Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity'
# meldb_reference = 'https://www.it-innovation.soton.ac.uk/projects/meldb'
# df_test = pd.DataFrame([{
#     "vocabulary_id": 'MELDB',
#     "vocabulary_name": meldb_description,
#     "vocabulary_reference": meldb_reference,
#     "vocabulary_version": meldb_version,
#     # "vocabulary_concept_id": 0,
# }])
# df_test.to_sql("VOCABULARY", conn, if_exists='append', index=False)

# One-off setup: concept-set tables (and their teardown).
# cur.execute("""
# CREATE TABLE CONCEPT_SET (
#     concept_set_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each concept set
#     atlas_id INTEGER,                                 -- Unique identifier generated by ATLAS
#     concept_set_name TEXT,                            -- Optional name for the concept set
#     concept_set_description TEXT,                     -- Optional description for the concept set
#     vocabulary_id TEXT NOT NULL,                      -- Foreign key to VOCABULARY table
#     FOREIGN KEY (vocabulary_id) REFERENCES VOCABULARY(vocabulary_id)
# );""")
# cur.execute("DROP TABLE CONCEPT_SET;")
# cur.execute("""
# CREATE TABLE CONCEPT_SET_ITEM (
#     concept_set_item_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each mapping
#     concept_set_id INTEGER NOT NULL,                       -- Foreign key to CONCEPT_SET table
#     concept_id INTEGER NOT NULL,                           -- Foreign key to CONCEPT table
#     FOREIGN KEY (concept_set_id) REFERENCES CONCEPT_SET(concept_set_id),
#     FOREIGN KEY (concept_id) REFERENCES CONCEPT(concept_id)
# );""")
# cur.execute("DROP TABLE CONCEPT_SET_ITEM;")

# Be sure to close the connection
conn.close()
```
%% Cell type:code id:d03b75f3-902f-42d7-b52f-dac7e79ecb11 tags:
```
python
# Load one SNOMED concept-set CSV into a scratch `test` table, keyed by code.
conn = sqlite3.connect("codes/omop_54.sqlite") # change to 'sqlite:///your_filename.db'
cur = conn.cursor()
file_path = "/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/HEART_VALVE_DISORDERS.csv"
# Read and index in one expression; the index column becomes the table's key column.
df = pd.read_csv(file_path, low_memory=False).set_index("code")
df.to_sql(name='test', con=conn, if_exists='replace')
conn.commit()
conn.close()
```
%% Cell type:code id:d96c3511-3831-400e-ba40-0a36abcc60d3 tags:
```
python
#DISPLAY SQL TABLE
table = "CONCEPT_SET_ITEM"

# Create a SQL connection to our SQLite database
conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()

# Fetch the column metadata; column 1 of the PRAGMA output is the column name.
cur.execute(f"PRAGMA table_info({table});")
df_cols = pd.DataFrame(cur.fetchall())
print(df_cols)
df_cols = df_cols[1]

# Fetch and display the whole table, relabelled with the real column names.
# (The SELECT literal had been split across lines by extraction; reconstructed.)
cur.execute(f"SELECT * FROM {table};")
df = pd.DataFrame(cur.fetchall())
df = df.rename(columns={i: s for i, s in enumerate(df_cols)})
display(df)

conn.close()
# Row-count sanity checks from previous runs:
# a+s = 13364
# a+s+i = 13591
```
%% Cell type:code id:42d49a00-9646-4ba4-afb6-12297289b7a7 tags:
```
python
def sql_row_exist(conn, table, column, value):
    """Return True if `table` has at least one row where `column` equals `value`."""
    # Identifiers (table/column names) cannot be bound as parameters, so they
    # are interpolated; the value itself is passed as a bound parameter.
    row = conn.cursor().execute(
        f"SELECT 1 FROM {table} WHERE {column} = ? LIMIT 1;",
        (value,),
    ).fetchone()
    return row is not None
```
%% Cell type:code id:f7b51bcd-6ee1-4023-8d36-7f419ce4120d tags:
```
python
#EXPORT MELDB CSV OUTPUT
# Load a phenotype output CSV and register each MELDB concept set (and its
# member concepts) in the OMOP SQLite database.
conn = sqlite3.connect("codes/omop_54.sqlite") # change to 'sqlite:///your_filename.db'
cur = conn.cursor()

vocab_output = "MELDB"   # vocabulary the concept sets are registered under
vocab_type = "SNOMED"    # vocabulary of the source codes in the CSV

file_path = "/home/jjd1c23/ssd/meldb/jjd1c23/phenotype/output/V3_2_10_MELD_snomed_no_translate.csv"
# file_path = "/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/HEART_VALVE_DISORDERS.csv"

# Read the CSV file with the specified delimiter
out = pd.read_csv(file_path, low_memory=False)
print(out.columns)  # BUGFIX: was `df.columns` — `df` is a stale variable from another cell

for concept_set_name, grp in out.groupby("MELDB_concept"):
    # Create the concept set if it does not exist yet.
    if not sql_row_exist(conn, "CONCEPT_SET", "concept_set_name", concept_set_name):
        # BUGFIX: parameterised — the f-string INSERT broke on names containing quotes.
        cur.execute(
            "INSERT INTO CONCEPT_SET (concept_set_name, vocabulary_id) VALUES (?, 'MELDB');",
            (concept_set_name,),
        )
    else:
        print("concept_set", concept_set_name, "already exists")
        #TODO: ask to remove old concept_set?

    # Get the concept_set_id just created/found.
    query = "SELECT concept_set_id FROM CONCEPT_SET WHERE concept_set_name = ? AND vocabulary_id = ?;"
    cur.execute(query, (concept_set_name, vocab_output, ))
    concept_set_id = cur.fetchone()[0]

    # Map each source code (e.g. SNOMED) to its OMOP concept_id, binding every
    # code as a parameter instead of splicing quoted strings into the SQL.
    codes = [str(c) for c in grp["code"]]
    placeholders = ", ".join("?" for _ in codes)
    query = f"SELECT concept_id FROM CONCEPT WHERE vocabulary_id = ? AND concept_code IN ({placeholders});"
    print(query)
    cur.execute(query, (vocab_type, *codes))
    df_out = pd.DataFrame(cur.fetchall(), columns=["concept_id"])

    if not len(grp) == len(df_out):
        print("ERROR: Some", vocab_type, "Codes do not exist in OMOP Database")

    # Link every matched concept to the concept set.
    df_out["concept_set_id"] = concept_set_id
    df_out.to_sql("CONCEPT_SET_ITEM", conn, if_exists='append', index=False)
    display(df_out)
    # break

conn.commit()
conn.close()
```
%% Cell type:code id:85007741-e34c-4112-a63c-9fb302b76958 tags:
```
python
# Builds a single-quoted, comma-separated SQL value list from grp["code"],
# e.g. "'119768002', '5905001'" (scratch cell; `grp` comes from the loop above).
"'"+"', '".join(list(grp["code"].astype(str)))+"'"
```
%% Cell type:markdown id:423e7c21-f3bd-439d-9dcb-c17cc2cc6854 tags:
### ATLAS
%% Cell type:code id:c6b45e4d-c7d2-42e7-9b4a-0e9c1c86d34b tags:
```
python
#Create ATLAS Concept Set
def atlas_create_concept(name, description="", items=None):
    """Create a concept set via the ATLAS WebAPI.

    Relies on module-level `url` and `headers` (defined elsewhere in the
    notebook).

    :param name: concept-set name shown in ATLAS.
    :param description: optional free-text description.
    :param items: list of concept expression items (default: empty list).
    :return: the new concept set's id on success, None on failure.
    """
    data = {
        "id": 0,
        "name": name,
        "description": description,
        "expression": {
            # BUGFIX: `items=[]` was a mutable default argument shared across calls.
            "items": items if items is not None else []
        }
    }
    try:
        # Sending the POST request
        response = requests.post(url, json=data, headers=headers)
        # Check the response status
        if response.status_code == 200 or response.status_code == 201:
            print("POST request successful:")
            body = response.json()  # Assuming the response is JSON
            print(body)
            # BUGFIX: was `response["id"]` — Response objects are not
            # subscriptable; the id lives in the decoded JSON body.
            return body["id"]
        else:
            print(f"POST request failed. HTTP Status Code: {response.status_code}")
            print("Response content:")
            print(response.text)
            return None
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

# Known ATLAS demo concept-set ids:
# Heart Test 1 - 1885487
# Heart Test 2 - 1885488
# Heart Valve Disorders - 1885449
```
%% Cell type:code id:45497623-1da0-4f74-b21e-da8811c89b04 tags:
```
python
def get_omop_concepts(cur, codes, vocab_id):
    """Fetch the OMOP CONCEPT rows matching `codes` within one vocabulary.

    :param cur: sqlite3 cursor on the OMOP database.
    :param codes: iterable of source concept codes (e.g. SNOMED codes).
    :param vocab_id: OMOP vocabulary_id to restrict the lookup to.
    :return: pandas DataFrame of matching CONCEPT rows (integer column labels;
        column 0 is concept_id, per the CONCEPT table's column order).
    """
    codes = [str(c) for c in codes]
    # Bind every code as a parameter instead of splicing quoted strings into
    # the SQL (the original built the IN (...) list by string concatenation).
    placeholders = ", ".join("?" for _ in codes)
    cur.execute(
        f"SELECT * FROM CONCEPT WHERE concept_code IN ({placeholders}) AND VOCABULARY_ID = ?;",
        (*codes, vocab_id),
    )
    df = pd.DataFrame(cur.fetchall())  # convert to pandas df
    # Guard the empty case: `df[0]` raises KeyError on an empty DataFrame.
    found = list(df[0]) if not df.empty else []
    print("Identified", len(found), "OMOP Concepts:", found)
    return df
def omop_concepts_to_atlas_json(df):
    """Convert OMOP CONCEPT rows into ATLAS concept-set expression items.

    :param df: DataFrame of CONCEPT rows with integer column labels, in the
        column order of ``SELECT * FROM CONCEPT`` (0=id, 1=name, 2=domain,
        3=vocabulary, 4=class, 6=code, 7=valid start, 8=valid end).
    :return: list of dicts matching the ATLAS API expression-item schema.
    """
    # Renamed from `json`: the original local shadowed the json module.
    items = []
    for _, row in df.iterrows():
        # Template for the ATLAS API.
        items.append({
            "concept": {
                'CONCEPT_ID': row[0],
                'CONCEPT_NAME': row[1],
                # NOTE(review): every concept is marked standard/valid regardless
                # of its actual flags in the row — confirm this is intended.
                'STANDARD_CONCEPT': 'S',
                'STANDARD_CONCEPT_CAPTION': 'Standard',
                'INVALID_REASON': 'V',
                'INVALID_REASON_CAPTION': 'Valid',
                'CONCEPT_CODE': row[6],
                'DOMAIN_ID': row[2],
                'VOCABULARY_ID': row[3],
                'CONCEPT_CLASS_ID': row[4],
                'VALID_START_DATE': int(row[7]),
                'VALID_END_DATE': int(row[8])
            },
            'isExcluded': False,
            'includeDescendants': False,
            'includeMapped': False
        })
    return items
# Open the OMOP SQLite database for the code-to-concept lookups below.
conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()
# Vocabulary to resolve the CSV's codes against.
vocab_id="SNOMED" #SNOMED, ATC, ICD10CM, ICD9CM, Read
# Phenotype concept CSV to convert; expected to have a "code" column.
csv_output = "/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/ANGER.csv"
#Load CSV Output File
df_in = pd.read_csv(csv_output)
print(len(df_in))
# df = get_omop_concepts(cur, ["119768002", "5905001"], "SNOMED")
df = get_omop_concepts(cur, list(df_in["code"]), vocab_id)
# NOTE(review): this assignment shadows the `json` module for the rest of
# the notebook session — consider renaming.
json = omop_concepts_to_atlas_json(df)
# display(json)
conn.close()
```
%% Cell type:code id:ea759907-c085-472a-82e2-07b6b19e2c8f tags:
```
python
#ATLAS GET CONCEPT SET
import requests
def request_get(url):
    """GET `url` and return the decoded JSON body, or None on any failure."""
    try:
        # Sending the GET request
        response = requests.get(url)
        # Anything other than 200 OK is reported and treated as a failure.
        if response.status_code != 200:
            print(f"Failed to fetch data. HTTP Status Code: {response.status_code}")
            print("Response content:")
            print(response.text)
            return None
        print("Response data:")
        # print(response.json())  # Assuming the response is in JSON format
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
#GET SET INFO
set_id = "1885449"
url = f"https://atlas-demo.ohdsi.org/WebAPI/conceptset/{set_id}"
request_get(url)
```
%% Cell type:code id:5a70e636-6051-4930-bf1b-30d093fd0552 tags:
```
python
#GET SET ITEMS (Concepts)
# Fetch the resolved concept expression for one ATLAS concept set
# via the public demo WebAPI and show the JSON payload.
set_id = "1885449"  # "Heart Valve Disorders" (see ids noted in the cell above)
url = f"https://atlas-demo.ohdsi.org/WebAPI/conceptset/{set_id}/expression/ATLASPROD"
response = request_get(url)
display(response)
```
%% Cell type:code id:96bfcd9c-27e8-4be4-a680-7553d908790e tags:
```
python
#ATLAS CREATE CONCEPT SET
```
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment