Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
C
concepts-processing
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Package registry
Operate
Terraform modules
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
meldb
concepts-processing
Commits
8f8b5119
Commit
8f8b5119
authored
5 months ago
by
Jakub Dylag
Browse files
Options
Downloads
Patches
Plain Diff
move summary excel into json config
parent
28f44a19
No related branches found
No related tags found
No related merge requests found
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
main.py
+7
-16
7 additions, 16 deletions
main.py
process_codes_WP.ipynb
+610
-596
610 additions, 596 deletions
process_codes_WP.ipynb
with
617 additions
and
612 deletions
main.py
+
7
−
16
View file @
8f8b5119
...
@@ -255,23 +255,14 @@ def run_all(mapping_file, target_code_type,
...
@@ -255,23 +255,14 @@ def run_all(mapping_file, target_code_type,
out
=
out
.
drop_duplicates
(
subset
=
[
"
CONCEPT_SET
"
,
"
CONCEPT
"
])
out
=
out
.
drop_duplicates
(
subset
=
[
"
CONCEPT_SET
"
,
"
CONCEPT
"
])
out
=
out
.
sort_values
(
by
=
[
"
CONCEPT_SET
"
,
"
CONCEPT
"
])
out
=
out
.
sort_values
(
by
=
[
"
CONCEPT_SET
"
,
"
CONCEPT
"
])
#Merge with Concept Types in Summary Excel File
#Add Concept Set Defintions metadata
if
"
excel_sheet
"
in
summary_config
:
summary_df
=
pd
.
DataFrame
(
summary_config
[
"
concept_set
"
])
#transform to dataframe
summary_df
=
read_table_file
(
summary_config
[
"
file
"
],
excel_sheet
=
summary_config
[
"
excel_sheet
"
])
if
"
metadata
"
in
summary_df
.
columns
:
else
:
summary_df
=
summary_df
.
join
(
pd
.
json_normalize
(
summary_df
[
"
metadata
"
]))
#metadata to columns
summary_df
=
read_table_file
(
summary_config
[
"
file
"
])
summary_df
=
summary_df
.
drop
(
columns
=
[
"
metadata
"
])
summary_cols_all
=
[]
#get all column names
summary_df
=
summary_df
.
rename
(
columns
=
{
"
concept_set_name
"
:
"
CONCEPT_SET
"
})
for
v
in
summary_config
[
"
columns
"
].
values
():
#TODO: put in seperate function - get all columns in JSON file object
if
type
(
v
)
==
str
:
summary_cols_all
.
append
(
v
)
else
:
summary_cols_all
+=
v
output_version
=
summary_config
[
"
version
"
]
summary_df
=
summary_df
[
summary_cols_all
]
#select all relevant columns
summary_df
=
summary_df
.
rename
(
columns
=
{
summary_config
[
"
columns
"
][
"
concept_set_name
"
]:
"
CONCEPT_SET
"
})
summary_df
=
summary_df
.
drop_duplicates
()
#remove duplicates
summary_df
=
summary_df
.
drop_duplicates
()
#remove duplicates
out
=
out
.
merge
(
summary_df
,
how
=
"
left
"
,
on
=
'
CONCEPT_SET
'
)
out
=
out
.
merge
(
summary_df
,
how
=
"
left
"
,
on
=
'
CONCEPT_SET
'
)
#merge with output
# Save Output File
# Save Output File
print
(
bcolors
.
HEADER
,
"
---
"
*
5
,
"
OUTPUT
"
,
"
---
"
*
5
,
bcolors
.
ENDC
)
print
(
bcolors
.
HEADER
,
"
---
"
*
5
,
"
OUTPUT
"
,
"
---
"
*
5
,
bcolors
.
ENDC
)
...
...
This diff is collapsed.
Click to expand it.
process_codes_WP.ipynb
+
610
−
596
View file @
8f8b5119
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
"cells": [
"cells": [
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
2
,
"execution_count":
null
,
"id": "8c8f4cdf-04a5-4762-895e-6555781a1f05",
"id": "8c8f4cdf-04a5-4762-895e-6555781a1f05",
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
...
@@ -113,163 +113,18 @@
...
@@ -113,163 +113,18 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
64
,
"execution_count":
null
,
"id": "f155b635-b459-4aff-81b2-e065fc223858",
"id": "f155b635-b459-4aff-81b2-e065fc223858",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"data": {
"text/plain": [
"0 False\n",
"dtype: bool"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
"source": []
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
94
,
"execution_count":
null
,
"id": "d040eda5-4028-4047-834c-7315e307e415",
"id": "d040eda5-4028-4047-834c-7315e307e415",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>icd10_code</th>\n",
" <th>icd10_alt_code</th>\n",
" <th>description</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A00</td>\n",
" <td>A00</td>\n",
" <td>Cholera</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A00.0</td>\n",
" <td>A000</td>\n",
" <td>Cholera due to Vibrio cholerae 01, biovar chol...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>A00.1</td>\n",
" <td>A001</td>\n",
" <td>Cholera due to Vibrio cholerae 01, biovar eltor</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>A00.9</td>\n",
" <td>A009</td>\n",
" <td>Cholera, unspecified</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>A01</td>\n",
" <td>A01</td>\n",
" <td>Typhoid and paratyphoid fevers</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17929</th>\n",
" <td>U84.3</td>\n",
" <td>U843</td>\n",
" <td>Resistance to tuberculostatic drug(s)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17930</th>\n",
" <td>U84.7</td>\n",
" <td>U847</td>\n",
" <td>Resistance to multiple antimicrobial drugs</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17931</th>\n",
" <td>U84.8</td>\n",
" <td>U848</td>\n",
" <td>Resistance to other specified antimicrobial drug</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17932</th>\n",
" <td>U84.9</td>\n",
" <td>U849</td>\n",
" <td>Resistance to unspecified antimicrobial drugs</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17933</th>\n",
" <td>U85</td>\n",
" <td>U85X</td>\n",
" <td>Resistance to antineoplastic drugs</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>17934 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" icd10_code icd10_alt_code \\\n",
"0 A00 A00 \n",
"1 A00.0 A000 \n",
"2 A00.1 A001 \n",
"3 A00.9 A009 \n",
"4 A01 A01 \n",
"... ... ... \n",
"17929 U84.3 U843 \n",
"17930 U84.7 U847 \n",
"17931 U84.8 U848 \n",
"17932 U84.9 U849 \n",
"17933 U85 U85X \n",
"\n",
" description \n",
"0 Cholera \n",
"1 Cholera due to Vibrio cholerae 01, biovar chol... \n",
"2 Cholera due to Vibrio cholerae 01, biovar eltor \n",
"3 Cholera, unspecified \n",
"4 Typhoid and paratyphoid fevers \n",
"... ... \n",
"17929 Resistance to tuberculostatic drug(s) \n",
"17930 Resistance to multiple antimicrobial drugs \n",
"17931 Resistance to other specified antimicrobial drug \n",
"17932 Resistance to unspecified antimicrobial drugs \n",
"17933 Resistance to antineoplastic drugs \n",
"\n",
"[17934 rows x 3 columns]"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"source": [
"df = pd.read_parquet(\"maps/processed/icd10_code.parquet\")\n",
"df = pd.read_parquet(\"maps/processed/icd10_code.parquet\")\n",
"df\n"
"df\n"
...
@@ -277,35 +132,10 @@
...
@@ -277,35 +132,10 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
113
,
"execution_count":
null
,
"id": "e0228ac9-8852-4818-b7f0-98429ca5229c",
"id": "e0228ac9-8852-4818-b7f0-98429ca5229c",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 True\n",
"1 False\n",
"dtype: bool\n",
"0 False\n",
"1 False\n",
"dtype: bool\n"
]
},
{
"data": {
"text/plain": [
"0 True\n",
"1 False\n",
"dtype: bool"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"source": [
"code = [\"A00.0\", \"*00.0\"]\n",
"code = [\"A00.0\", \"*00.0\"]\n",
"code = pd.Series(code)\n",
"code = pd.Series(code)\n",
...
@@ -332,29 +162,35 @@
...
@@ -332,29 +162,35 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
119
,
"execution_count":
null
,
"id": "
4c9b6b3f-08aa-4f61-b9b2-44a24b5d00a0
",
"id": "
85dc197b-451e-4fa9-a53b-e6770c132123
",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"source": [
"name": "stdout",
"import json\n",
"output_type": "stream",
"import os\n",
"text": [
"\n",
"ALL FILES 878 878\n",
"path_json = \"../concepts/PHEN_assign_v3.json\"\n",
"JSON CONCEPTS 436 397\n",
"\n",
"EXCEL CONCEPTS 440 397\n",
"#Load JSON Concept Definitions\n",
"1755 878\n"
"mapping = json.load(open(path_json,'rb'))\n",
"summary_config = mapping[\"concept_sets\"][\"concept_set\"]\n",
"summary_df = pd.DataFrame(summary_config) #change to dataframe\n",
"\n",
"summary_df = summary_df.join(pd.json_normalize(summary_df[\"metadata\"])) #metadata to columns\n",
"summary_df = summary_df.drop(columns=[\"metadata\"])\n",
"summary_df = summary_df.rename(columns={\"concept_set_name\":\"CONCEPT_SET\"})\n",
"summary_df = summary_df.drop_duplicates() #remove duplicates\n",
" \n",
"summary_df\n"
]
]
},
},
{
{
"name": "stderr",
"cell_type": "code",
"output_type": "stream",
"execution_count": null,
"text": [
"id": "4c9b6b3f-08aa-4f61-b9b2-44a24b5d00a0",
"/opt/conda/lib/python3.9/site-packages/openpyxl/worksheet/_reader.py:329: UserWarning: Data Validation extension is not supported and will be removed\n",
"metadata": {},
" warn(msg)\n"
"outputs": [],
]
}
],
"source": [
"source": [
"import json\n",
"import json\n",
"import os\n",
"import os\n",
...
@@ -438,111 +274,10 @@
...
@@ -438,111 +274,10 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
120
,
"execution_count":
null
,
"id": "f8e70c33-c869-46f8-953e-f6b52992cfbb",
"id": "f8e70c33-c869-46f8-953e-f6b52992cfbb",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"data": {
"text/plain": [
"'JSON MISSING'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filepath</th>\n",
" <th>json_concept</th>\n",
" <th>json_code_types</th>\n",
" <th>excel_concept</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [filepath, json_concept, json_code_types, excel_concept]\n",
"Index: []"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'EXCEL MISSING'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filepath</th>\n",
" <th>json_concept</th>\n",
" <th>json_code_types</th>\n",
" <th>excel_concept</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [filepath, json_concept, json_code_types, excel_concept]\n",
"Index: []"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"source": [
"display(\"JSON MISSING\", outs[outs[\"json_concept\"].isna() & outs[\"excel_concept\"].notna()])\n",
"display(\"JSON MISSING\", outs[outs[\"json_concept\"].isna() & outs[\"excel_concept\"].notna()])\n",
"display(\"EXCEL MISSING\", outs[outs[\"json_concept\"].notna() & outs[\"excel_concept\"].isna()])"
"display(\"EXCEL MISSING\", outs[outs[\"json_concept\"].notna() & outs[\"excel_concept\"].isna()])"
...
@@ -550,7 +285,7 @@
...
@@ -550,7 +285,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
118
,
"execution_count":
null
,
"id": "9d84465f-f064-4df2-b0e4-2dfb217aea21",
"id": "9d84465f-f064-4df2-b0e4-2dfb217aea21",
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
...
@@ -567,21 +302,10 @@
...
@@ -567,21 +302,10 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
10
,
"execution_count":
null
,
"id": "7f7fc771-e406-42c7-8a09-16a20b5298f5",
"id": "7f7fc771-e406-42c7-8a09-16a20b5298f5",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"data": {
"text/plain": [
"65307"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"source": [
"total_length = 0\n",
"total_length = 0\n",
"for file in all_files[\"filepath\"]:\n",
"for file in all_files[\"filepath\"]:\n",
...
@@ -620,6 +344,7 @@
...
@@ -620,6 +344,7 @@
"cell_type": "markdown",
"cell_type": "markdown",
"id": "357bb84c-90c2-4b5f-95c0-443191783a7f",
"id": "357bb84c-90c2-4b5f-95c0-443191783a7f",
"metadata": {
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
"tags": []
},
},
"source": [
"source": [
...
@@ -628,48 +353,10 @@
...
@@ -628,48 +353,10 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
4
,
"execution_count":
null
,
"id": "7d3f9cb7-be86-4f1f-92f6-991094eb7bb7",
"id": "7d3f9cb7-be86-4f1f-92f6-991094eb7bb7",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------- output/V2_2_2_MELD_concepts_readv2.csv ---------\n",
"MELDB missing concepts 0\n",
"Chars present: ['.' '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' 'A' 'B' 'C' 'D' 'E' 'F' 'G'\n",
" 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y'\n",
" 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q'\n",
" 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']\n",
"--------- output/V2_2_2_MELD_snomed_no_translate.csv ---------\n",
"MELDB missing concepts 0\n",
"--------- output/V2_2_2_MELD_icd10_no_translate.csv ---------\n",
"MELDB missing concepts 0\n",
"Chars present: ['0' '1' '2' '3' '4' '5' '6' '7' '8' '9' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H'\n",
" 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'T' 'W' 'X' 'Y' 'Z']\n",
"--------- output/V2_2_2_MELD_atc_no_translate.csv ---------\n",
"MELDB missing concepts 0\n",
"Chars present: ['0' '1' '2' '3' '6' 'A' 'F' 'N' 'X']\n",
"--------- output/V2_2_2_MELD_errors.csv ---------\n"
]
},
{
"data": {
"text/plain": [
"CODE_TYPE\n",
"snomed_code 1261\n",
"read2_code 464\n",
"read3_code 80\n",
"icd10_code 1\n",
"Name: count, dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"source": [
"version = \"V2_2_2\"\n",
"version = \"V2_2_2\"\n",
"output_files = [f\"output/{version}_MELD_concepts_readv2.csv\",\n",
"output_files = [f\"output/{version}_MELD_concepts_readv2.csv\",\n",
...
@@ -700,141 +387,10 @@
...
@@ -700,141 +387,10 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
209
,
"execution_count":
null
,
"id": "08e0ecc1-9271-48c3-9c5b-094800072906",
"id": "08e0ecc1-9271-48c3-9c5b-094800072906",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"## Compare Concepts V2_1_4 to V2_2_3\n",
"output/V2_1_4_MELD_concepts_readv2.csv output/V2_2_3_MELD_concepts_readv2.csv\n",
"- Removed Concepts ['THYROID_DISEASE', 'SCHIZOPHRENIA_BIPOLAR_DISORDER', 'PSIORIASIS_ECZEMA', 'HAEMATOLOGICAL_CANCERS', 'INFLAMM_ARTHROPATHIES', 'ALL_CANCER', 'STROKE_TIA', 'DIABETES', 'PMR_AND_GCD', 'LONG_TERM_MS_PROBLEMS', 'ALL_CKD', 'INFLAMM_ARTHROPATHIES_CONNECTIVE_TISSUE_DIS', 'RENAL_TRANSPLANT_DIALYSIS']\n",
"- Added Concepts []\n",
"- Changed Concepts \n",
"\t - ANXIETY -7.0\n",
"\t - ARRHYTHMIA -1.0\n",
"\t - ASTHMA -1.0\n",
"\t - AUTISM_AND_ADHD -4.0\n",
"\t - BIPOLAR_DISORDER -1.0\n",
"\t - BLINDNESS_AND_LOW_VISION -3.0\n",
"\t - COELIAC_DISEASE -1.0\n",
"\t - CORONARY_HEART_DISEASE -8.0\n",
"\t - DEAFNESS -33.0\n",
"\t - DEMENTIA_ALZHEIMER -2.0\n",
"\t - DEPRESSION -5.0\n",
"\t - DIABETES_T1 -1.0\n",
"\t - DIABETES_T2 -1.0\n",
"\t - DIALYSIS -14.0\n",
"\t - DIVERTICULAR_DISEASE -11.0\n",
"\t - DRUG_ALCOHOL_MISUSE -3.0\n",
"\t - EATING_DISORDERS -2.0\n",
"\t - EPILEPSY -1.0\n",
"\t - FATIGUE -27.0\n",
"\t - HEADACHE -48.0\n",
"\t - HF -3.0\n",
"\t - INCONTINENCE -21.0\n",
"\t - LEARNING_DISABILITY -3.0\n",
"\t - MSK_PAIN -36.0\n",
"\t - MULTIPLE_SCLEROSIS -1.0\n",
"\t - PALLIATIVE_CARE -8.0\n",
"\t - PLASMACELL -1.0\n",
"\t - PTSD -1.0\n",
"\t - SCHIZOPHRENIA -1.0\n",
"\t - SELF_HARM -37.0\n",
"\t - SLEEP_PROBLEMS -74.0\n",
"\t - STRESS -31.0\n",
"\t - SYSTEMIC_LUPUS_ERYTHEMATOSUS -2.0\n",
"\n",
"output/V2_1_4_MELD_snomed_no_translate.csv output/V2_2_3_MELD_snomed_no_translate.csv\n",
"- Removed Concepts ['THYROID_DISEASE', 'SCHIZOPHRENIA_BIPOLAR_DISORDER', 'PSIORIASIS_ECZEMA', 'HAEMATOLOGICAL_CANCERS', 'INFLAMM_ARTHROPATHIES', 'ALL_CANCER', 'STROKE_TIA', 'DIABETES', 'PMR_AND_GCD', 'LONG_TERM_MS_PROBLEMS', 'ALL_CKD', 'INFLAMM_ARTHROPATHIES_CONNECTIVE_TISSUE_DIS', 'RENAL_TRANSPLANT_DIALYSIS']\n",
"- Added Concepts []\n",
"- Changed Concepts \n",
"\t - ANAEMIA -2.0\n",
"\t - ANEURYSM -3.0\n",
"\t - ANXIETY -7.0\n",
"\t - ARRHYTHMIA -25.0\n",
"\t - ASTHMA -34.0\n",
"\t - ATOPIC_ECZEMA -6.0\n",
"\t - AUTISM_AND_ADHD -2.0\n",
"\t - BIPOLAR_DISORDER -3.0\n",
"\t - BLINDNESS_AND_LOW_VISION -4.0\n",
"\t - BREAST_CANCER -2.0\n",
"\t - BRONCHIECSTASIS -1.0\n",
"\t - CHRONIC_BACK_PAIN -1.0\n",
"\t - CHRONIC_FATIGUE_SYNDROME -3.0\n",
"\t - CHRONIC_LIVER_DISEASE -14.0\n",
"\t - CHRONIC_PAIN -2.0\n",
"\t - CKD_STAGE3_5 -3.0\n",
"\t - COELIAC_DISEASE -6.0\n",
"\t - COLON_CANCER -6.0\n",
"\t - CONGENITAL_DIS_CHROMOSOMAL_ABNORMALITIES -1.0\n",
"\t - COPD -31.0\n",
"\t - CORONARY_HEART_DISEASE -21.0\n",
"\t - CYSTIC_FIBROSIS -24.0\n",
"\t - DEAFNESS -15.0\n",
"\t - DEMENTIA_ALZHEIMER -111.0\n",
"\t - DEPRESSION -34.0\n",
"\t - DIABETES_T2 -2.0\n",
"\t - DIABETIC_RETINOPATHY -13.0\n",
"\t - DIALYSIS -1.0\n",
"\t - DIVERTICULAR_DISEASE -4.0\n",
"\t - DRUG_ALCOHOL_MISUSE -310.0\n",
"\t - EATING_DISORDERS -4.0\n",
"\t - ENDOMETRIOSIS -1.0\n",
"\t - EPILEPSY -11.0\n",
"\t - GLAUCOMA -3.0\n",
"\t - GOUT -4.0\n",
"\t - HEART_VALVE_DISORDERS -6.0\n",
"\t - HF -4.0\n",
"\t - HIVAIDS -18.0\n",
"\t - HYPERTENSION -11.0\n",
"\t - HYPERTHYROIDISM -1.0\n",
"\t - HYPOTHYROIDISM -8.0\n",
"\t - IBD -2.0\n",
"\t - ILD -2.0\n",
"\t - LEARNING_DISABILITY -40.0\n",
"\t - LEUKAEMIA -1.0\n",
"\t - LYMPHOMA -2.0\n",
"\t - MENIERES_DISEASE -1.0\n",
"\t - METASTATIC_CANCER -3.0\n",
"\t - MOBILITY_PROBLEMS -45.0\n",
"\t - MULTIPLE_SCLEROSIS -13.0\n",
"\t - OBESITY -63.0\n",
"\t - OSTEOARTHRITIS -3.0\n",
"\t - OSTEOPOROSIS -4.0\n",
"\t - PARALYSIS -3.0\n",
"\t - PARKINSONS -2.0\n",
"\t - PLASMACELL -1.0\n",
"\t - PROSTATE_CANCER -2.0\n",
"\t - PROSTATE_DISORDERS -2.0\n",
"\t - PSORIASIS -3.0\n",
"\t - PTSD -38.0\n",
"\t - RENAL_TRANSPLANT -1.0\n",
"\t - RHEUMATOID_ARTHRITIS -8.0\n",
"\t - SCHIZOPHRENIA -85.0\n",
"\t - SKIN_CANCER -4.0\n",
"\t - STROKE -4.0\n",
"\t - SYSTEMIC_LUPUS_ERYTHEMATOSUS -1.0\n",
"\t - TIA -1.0\n",
"\t - VIRAL_HEPATITIS -9.0\n",
"\t - VTD -5.0\n",
"\n",
"output/V2_1_4_MELD_icd10_no_translate.csv output/V2_2_3_MELD_icd10_no_translate.csv\n",
"- Removed Concepts ['THYROID_DISEASE', 'SCHIZOPHRENIA_BIPOLAR_DISORDER', 'PSIORIASIS_ECZEMA', 'HAEMATOLOGICAL_CANCERS', 'INFLAMM_ARTHROPATHIES', 'ALL_CANCER', 'STROKE_TIA', 'DIABETES', 'PMR_AND_GCD', 'LONG_TERM_MS_PROBLEMS', 'ALL_CKD', 'INFLAMM_ARTHROPATHIES_CONNECTIVE_TISSUE_DIS']\n",
"- Added Concepts []\n",
"- Changed Concepts \n",
"\t - CVD_EVENTS -1.0\n",
"\n",
"output/V2_1_4_MELD_atc_no_translate.csv output/V2_2_3_MELD_atc_no_translate.csv\n",
"- Removed Concepts []\n",
"- Added Concepts []\n",
"- Changed Concepts \n",
"\n"
]
}
],
"source": [
"source": [
"def get_output_files(version):\n",
"def get_output_files(version):\n",
" output_files = [f\"output/{version}_MELD_concepts_readv2.csv\",\n",
" output_files = [f\"output/{version}_MELD_concepts_readv2.csv\",\n",
...
@@ -884,6 +440,33 @@
...
@@ -884,6 +440,33 @@
" "
" "
]
]
},
},
{
"cell_type": "code",
"execution_count": null,
"id": "cc60c137-5a85-4155-af6b-6796f8c05980",
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"import os\n",
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/PHEN_summary_working.csv\")\n",
"df = df.set_index(\"#\")\n",
"\n",
"for vocab in [\"atc\", \"icd10\", \"readv2\", \"snomed\"]:\n",
" df[vocab.upper()] = \"\"\n",
"\n",
" for file in glob.glob(f\"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/{vocab}/*.csv\"):\n",
" concept_set = os.path.basename(file)[:-4]\n",
" row_index = df[df[\"CONCEPT NAME \"] == concept_set].index[0]\n",
"\n",
" df.loc[row_index, vocab.upper()] = \"YES\"\n",
"\n",
"df = df.drop(columns=[\"READv2_CODE\", \"ICD10_CODE\"])\n",
"df.to_csv(\"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/PHEN_summary_working_labelled.csv\")"
]
},
{
{
"cell_type": "markdown",
"cell_type": "markdown",
"id": "e5c4291f-847b-4c82-976e-bd5b3a7b6bcc",
"id": "e5c4291f-847b-4c82-976e-bd5b3a7b6bcc",
...
@@ -1095,7 +678,7 @@
...
@@ -1095,7 +678,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
170
,
"execution_count":
null
,
"id": "a968ffb1-4337-456b-8d20-419888b4044f",
"id": "a968ffb1-4337-456b-8d20-419888b4044f",
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
...
@@ -1112,7 +695,7 @@
...
@@ -1112,7 +695,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
184
,
"execution_count":
null
,
"id": "c70b1ce2-0f41-4d02-ad17-6fc44bc3c6bf",
"id": "c70b1ce2-0f41-4d02-ad17-6fc44bc3c6bf",
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
...
@@ -1125,127 +708,558 @@
...
@@ -1125,127 +708,558 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
194
,
"execution_count":
null
,
"id": "d5d34237-02d4-4dea-8c20-5adaf337f6b5",
"id": "d5d34237-02d4-4dea-8c20-5adaf337f6b5",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
"source": [
"df1.merge(df2, how='inner', on='snomed_code')"
]
},
{
{
"data": {
"cell_type": "code",
"text/html": [
"execution_count": null,
"<div>\n",
"id": "b3166cf0-e4a5-43e0-aeac-78827427422e",
"<style scoped>\n",
"metadata": {},
" .dataframe tbody tr th:only-of-type {\n",
"outputs": [],
" vertical-align: middle;\n",
"source": [
" }\n",
".astype(str).dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0a766f9-7959-4a10-b58f-cd946a878b60",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"../concepts/PHEN_summary_working.csv\")\n",
"cols = list(df.columns)\n",
"cols.remove('CONCEPT NAME ')\n",
"cols.remove('AGREED')\n",
"df = df.applymap(lambda x: str(x) if isinstance(x, (int, float)) else x) #change to int\n",
"\n",
"df_copy = df.rename(columns={\n",
" \"CONCEPT NAME \":\"concept_set_name\",\n",
" \"AGREED\":\"concept_set_status\"\n",
"})\n",
"df_copy[\"concept_set_status\"] = df_copy[\"concept_set_status\"].replace(\"USE\", \"AGREED\")\n",
"df_copy = df_copy[[\"concept_set_name\", \"concept_set_status\"]]\n",
"outs = df_copy.to_dict(orient='records')\n",
"\n",
"\n",
" .dataframe tbody tr th {\n",
"for i, out in enumerate(outs):\n",
" vertical-align: top;\n",
" out[\"metadata\"] = dict(df[cols].iloc[i])\n",
" }\n",
" \n",
" \n",
" .dataframe thead th {\n",
"json.dumps(outs)\n"
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>bnf_code</th>\n",
" <th>snomed_code</th>\n",
" <th>read2_code</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [bnf_code, snomed_code, read2_code]\n",
"Index: []"
]
]
},
},
"execution_count": 194,
{
"cell_type": "code",
"execution_count": null,
"id": "8a204a95-dc4c-4183-9ea7-f5c5e95e9087",
"metadata": {},
"metadata": {},
"output_type": "execute_result"
"outputs": [],
}
"source": []
],
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ce1ab58-50b4-4c22-b72b-c698de6830f7",
"metadata": {},
"outputs": [],
"source": [
"source": [
"
df1.merge(df2, how='inner', on='snomed_code')
"
"
import json
"
]
]
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
180
,
"execution_count":
null
,
"id": "
d0cbadfe-ef55-40a8-a0f1-a9fc69d7456b
",
"id": "
f1ea81c6-d1db-408f-9d3a-b96f44efe21f
",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
"source": []
},
{
{
"data": {
"cell_type": "markdown",
"text/html": [
"id": "5eb544a3-9dd1-41e8-88c2-a808646c6112",
"<div>\n",
"metadata": {
"<style scoped>\n",
"jp-MarkdownHeadingCollapsed": true,
" .dataframe tbody tr th:only-of-type {\n",
"tags": []
" vertical-align: middle;\n",
},
" }\n",
"source": [
"### OMOP Database"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c9e58e62-9e44-4d0c-9d8d-35c175c07e6c",
"metadata": {},
"outputs": [],
"source": [
"import sqlite3\n",
"import csv\n",
"import pandas as pd\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f67c9a1-373f-4799-8a85-72767662d912",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d0ecdf69-ee90-42c1-ad25-d8357b603d1b",
"metadata": {},
"outputs": [],
"source": [
"#IMPORT OMOP VOCABS\n",
"conn = sqlite3.connect(\"codes/omop_54.sqlite\") # change to 'sqlite:///your_filename.db'\n",
"folder_path = \"codes/vocabulary_download_v5_{9424944c-2b76-4127-8f05-f535e0f15e2a}_1731661390540\"\n",
"\n",
"# Check if the folder exists\n",
"if not os.path.isdir(folder_path):\n",
" raise Exception(f\"Error: The folder '{folder_path}' does not exist.\") \n",
"\n",
"# Iterate through files in the folder\n",
"for filename in os.listdir(folder_path):\n",
" if filename.endswith(\".csv\"): # Check if the file is a CSV\n",
" file_path = os.path.join(folder_path, filename)\n",
" try:\n",
" print(f\"Reading file: {file_path}\")\n",
" # Read the CSV file with the specified delimiter\n",
" df = pd.read_csv(file_path, delimiter=\"\\t\", low_memory=False)\n",
" table_name = os.path.splitext(os.path.basename(file_path))[0] #Get name of file\n",
" \n",
" #Export Table to sqlite db\n",
" df.to_sql(table_name, conn, if_exists='replace', index=False)\n",
" \n",
" except Exception as e:\n",
" raise Exception(f\"Error reading file {file_path}: {e}\")\n",
"\n",
"conn.commit()\n",
"conn.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9cafd0c-a3bd-408b-bca8-b0de2acde1cd",
"metadata": {},
"outputs": [],
"source": [
"# Create a SQL connection to our SQLite database\n",
"conn = sqlite3.connect(\"codes/omop_54.sqlite\")\n",
"cur = conn.cursor()\n",
"\n",
"#Print ALL Columns in Table\n",
"# table=\"CONCEPT_SET\"\n",
"# cur.execute(f\"PRAGMA table_info({table});\")\n",
"# print(pd.DataFrame(cur.fetchall()))\n",
"\n",
"#Print ALL TABLE NAMES\n",
"# cur.execute(\"SELECT name FROM sqlite_master WHERE type='table' AND name=? ;\", (\"VOCABULARY\",))\n",
"# print(cur.fetchone())\n",
" \n",
"cur.execute(\"SELECT vocabulary_id FROM VOCABULARY WHERE vocabulary_id=? ;\", (\"MELDB\",))\n",
"print(cur.fetchone())\n",
"\n",
" \n",
" \n",
"#Print WHOLE TABLE\n",
"# cur.execute('SELECT * FROM CONCEPT;')\n",
"# cur.execute('SELECT * FROM CONCEPT WHERE standard_concept = \"C\";')\n",
"# cur.execute('SELECT * FROM CONCEPT WHERE concept_code = \"119768002\" LIMIT 1;')\n",
"# cur.execute('SELECT * FROM CONCEPT WHERE concept_code IN (\"119768002\", \"5905001\");')\n",
"# cur.execute('SELECT DISTINCT VOCABULARY_ID FROM CONCEPT;')\n",
"# df = pd.DataFrame(cur.fetchall())\n",
"# print(list(df[0]))\n",
"# display(df)\n",
"# for row in :\n",
" # print(row)\n",
"\n",
"\n",
"\n",
"#Get Header of Table\n",
"# table=\"CONCEPT_CLASS\"\n",
"# cur.execute(f\"SELECT * FROM {table} LIMIT 3;\")\n",
"# print(cur.fetchall())\n",
"\n",
"#create meldb VOCABULARY\n",
"# meldb_version='v3.2.10'\n",
"# meldb_description = 'Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity'\n",
"# meldb_reference = 'https://www.it-innovation.soton.ac.uk/projects/meldb'\n",
"# df_test = pd.DataFrame([{\n",
"# \"vocabulary_id\": 'MELDB',\n",
"# \"vocabulary_name\": meldb_description,\n",
"# \"vocabulary_reference\": meldb_reference,\n",
"# \"vocabulary_version\": meldb_version,\n",
"# # \"vocabulary_concept_id\": 0,\n",
"# }])\n",
"# df_test.to_sql(\"VOCABULARY\", conn, if_exists='append', index=False)\n",
"\n",
"\n",
"# cur.execute(\"\"\"\n",
"# CREATE TABLE CONCEPT_SET (\n",
"# concept_set_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each concept set\n",
"# atlas_id INTEGER, -- Unique identifier generated by ATLAS\n",
"# concept_set_name TEXT, -- Optional name for the concept set\n",
"# concept_set_description TEXT, -- Optional description for the concept set\n",
"# vocabulary_id TEXT NOT NULL, -- Foreign key to VOCABULARY table\n",
"# FOREIGN KEY (vocabulary_id) REFERENCES VOCABULARY(vocabulary_id)\n",
"# );\"\"\")\n",
"# cur.execute(\"DROP TABLE CONCEPT_SET;\")\n",
"\n",
"# cur.execute(\"\"\"\n",
"# CREATE TABLE CONCEPT_SET_ITEM (\n",
"# concept_set_item_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each mapping\n",
"# concept_set_id INTEGER NOT NULL, -- Foreign key to CONCEPT_SET table\n",
"# concept_id INTEGER NOT NULL, -- Foreign key to CONCEPT table\n",
"# FOREIGN KEY (concept_set_id) REFERENCES CONCEPT_SET(concept_set_id),\n",
"# FOREIGN KEY (concept_id) REFERENCES CONCEPT(concept_id)\n",
"# );\"\"\")\n",
"# cur.execute(\"DROP TABLE CONCEPT_SET_ITEM;\")\n",
"\n",
"# Be sure to close the connection\n",
"conn.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d03b75f3-902f-42d7-b52f-dac7e79ecb11",
"metadata": {},
"outputs": [],
"source": [
"conn = sqlite3.connect(\"codes/omop_54.sqlite\") # change to 'sqlite:///your_filename.db'\n",
"cur = conn.cursor()\n",
"\n",
"\n",
" .dataframe tbody tr th {\n",
"file_path = \"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/HEART_VALVE_DISORDERS.csv\"\n",
" vertical-align: top;\n",
"df = pd.read_csv(file_path, low_memory=False)\n",
" }\n",
"df = df.set_index(\"code\")\n",
"\n",
"df.to_sql(name='test', con=conn, if_exists='replace')\n",
"\n",
"conn.commit()\n",
"conn.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d96c3511-3831-400e-ba40-0a36abcc60d3",
"metadata": {},
"outputs": [],
"source": [
"#DISPLAY SQL TABLE\n",
"table=\"CONCEPT_SET_ITEM\"\n",
"\n",
"# Create a SQL connection to our SQLite database\n",
"conn = sqlite3.connect(\"codes/omop_54.sqlite\")\n",
"cur = conn.cursor()\n",
"\n",
"#Print ALL Columns in Table\n",
"cur.execute(f\"PRAGMA table_info({table});\")\n",
"df_cols = pd.DataFrame(cur.fetchall())\n",
"print(df_cols)\n",
"df_cols = df_cols[1]\n",
"\n",
"#Print TABLE\n",
"cur.execute(f\"SELECT * FROM {table};\")\n",
"df = pd.DataFrame(cur.fetchall())\n",
"df = df.rename(columns={i:s for i, s in enumerate(df_cols)})\n",
"display(df)\n",
"\n",
"conn.close()\n",
"\n",
"\n",
"# a+s = 13364 \n",
"# a+s+i = 13591\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42d49a00-9646-4ba4-afb6-12297289b7a7",
"metadata": {},
"outputs": [],
"source": [
"def sql_row_exist(conn, table, column, value):\n",
"\t# Execute and check if a result exists\n",
"\tcur = conn.cursor()\n",
"\tquery = f\"SELECT 1 FROM {table} WHERE {column} = ? LIMIT 1;\"\n",
"\tcur.execute(query, (value,))\n",
"\texists = cur.fetchone() is not None\n",
"\t\n",
"\treturn exists"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f7b51bcd-6ee1-4023-8d36-7f419ce4120d",
"metadata": {},
"outputs": [],
"source": [
"#EXPORT MELDB CSV OUTPUT\n",
"\n",
"conn = sqlite3.connect(\"codes/omop_54.sqlite\") # change to 'sqlite:///your_filename.db'\n",
"cur = conn.cursor()\n",
"\n",
"vocab_output = \"MELDB\"\n",
"vocab_type = \"SNOMED\"\n",
"file_path = \"/home/jjd1c23/ssd/meldb/jjd1c23/phenotype/output/V3_2_10_MELD_snomed_no_translate.csv\"\n",
"# file_path = \"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/HEART_VALVE_DISORDERS.csv\"\n",
"\n",
"# Read the CSV file with the specified delimiter\n",
"out = pd.read_csv(file_path, low_memory=False)\n",
"print(df.columns)\n",
"\n",
"for concept_set_name, grp in out.groupby(\"MELDB_concept\"):\n",
" # display(concept_set_name, grp[[\"code\", \"MELDB_concept\"]])\n",
" \n",
" #Create Concept_Set\n",
" if not sql_row_exist(conn, \"CONCEPT_SET\", \"concept_set_name\", concept_set_name):\n",
" cur.execute(f\"INSERT INTO CONCEPT_SET (concept_set_name, vocabulary_id) VALUES ('{concept_set_name}', 'MELDB');\")\n",
" else:\n",
" print(\"concept_set\", concept_set_name, \"already exists\")\n",
" #TODO: ask to remove old concept_set?\n",
" \n",
" #Get Concept_set_Id\n",
" query = \"SELECT concept_set_id FROM CONCEPT_SET WHERE concept_set_name = ? AND vocabulary_id = ?;\"\n",
" cur.execute(query, (concept_set_name, vocab_output, )) \n",
" concept_set_id = cur.fetchone()[0]\n",
" \n",
" \n",
" .dataframe thead th {\n",
" #Get corresponing Concept_id (OMOP) for each Concept_code (e.g. SNOMED)\n",
" text-align: right;\n",
" concept_codes = \"'\"+\"', '\".join(list(grp[\"code\"].astype(str)))+\"'\"\n",
" query = f\"SELECT concept_id FROM CONCEPT WHERE vocabulary_id = ? AND concept_code IN ({concept_codes});\"\n",
" print(query)\n",
" cur.execute(query, (vocab_type, ))\n",
" df_out = pd.DataFrame(cur.fetchall(), columns=[\"concept_id\"])\n",
" \n",
" if not len(grp) == len(df_out):\n",
" print(\"ERROR: Some\", vocab_type, \"Codes do not exist in OMOP Database\")\n",
" \n",
" #Create Concept_set_item\n",
" df_out[\"concept_set_id\"] = concept_set_id\n",
" df_out.to_sql(\"CONCEPT_SET_ITEM\", conn, if_exists='append', index=False)\n",
" \n",
" display(df_out)\n",
" \n",
" \n",
" \n",
" # break\n",
" \n",
" \n",
"\n",
"# #Create New CONCEPT_SET\n",
"# table_name = os.path.splitext(os.path.basename(file_path))[0] #Get name of file\n",
"# cur.execute(f\"INSERT INTO CONCEPT_SET (concept_class_name) VALUES ('{table_name}');\")\n",
" \n",
" \n",
" \n",
"\n",
" \n",
" \n",
"\n",
"\n",
"conn.commit()\n",
"conn.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85007741-e34c-4112-a63c-9fb302b76958",
"metadata": {},
"outputs": [],
"source": [
"\"'\"+\"', '\".join(list(grp[\"code\"].astype(str)))+\"'\""
]
},
{
"cell_type": "markdown",
"id": "423e7c21-f3bd-439d-9dcb-c17cc2cc6854",
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"source": [
"### ATLAS"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6b45e4d-c7d2-42e7-9b4a-0e9c1c86d34b",
"metadata": {},
"outputs": [],
"source": [
"#Create ATLAS Concept Set\n",
"\n",
"def atlas_create_concept(name, description=\"\", items=[]):\n",
" data={\n",
" \"id\": 0,\n",
" \"name\": name,\n",
" \"description\": description,\n",
" \"expression\": {\n",
" \"items\":items \n",
" }\n",
" }\n",
"</style>\n",
" }\n",
"<table border=\"1\" class=\"dataframe\">\n",
"\n",
" <thead>\n",
" try:\n",
" <tr style=\"text-align: right;\">\n",
" # Sending the POST request\n",
" <th></th>\n",
" response = requests.post(url, json=data, headers=headers)\n",
" <th>bnf_code</th>\n",
"\n",
" <th>snomed_code</th>\n",
" # Check the response status\n",
" <th>read2_code</th>\n",
" if response.status_code == 200 or response.status_code == 201:\n",
" </tr>\n",
" print(\"POST request successful:\")\n",
" </thead>\n",
" print(response.json()) # Assuming the response is JSON\n",
" <tbody>\n",
" return response[\"id\"]\n",
" </tbody>\n",
" else:\n",
"</table>\n",
" print(f\"POST request failed. HTTP Status Code: {response.status_code}\")\n",
"</div>"
" print(\"Response content:\")\n",
],
" print(response.text)\n",
"text/plain": [
" return None\n",
"Empty DataFrame\n",
"\n",
"Columns: [bnf_code, snomed_code, read2_code]\n",
" except requests.exceptions.RequestException as e:\n",
"Index: []"
" print(f\"An error occurred: {e}\")\n",
"\n",
"# Heart Test 1 - 1885487\n",
"# Heart Test 2 - 1885488\n",
"# Heart Valve Disorders - 1885449\n",
"\n"
]
]
},
},
"execution_count": 180,
{
"cell_type": "code",
"execution_count": null,
"id": "45497623-1da0-4f74-b21e-da8811c89b04",
"metadata": {},
"metadata": {},
"output_type": "execute_result"
"outputs": [],
}
"source": [
],
"def get_omop_concepts(cur, codes, vocab_id): \n",
"source": []
" #Create List for SQL\n",
" mask = \"\"\n",
" for c in codes:\n",
" mask+=f'\"{c}\", '\n",
" mask = mask[:-2] #remove last comma\n",
" \n",
" #Execute SQL\n",
" cur.execute(f'SELECT * FROM CONCEPT WHERE concept_code IN ({mask}) AND VOCABULARY_ID = \"{vocab_id}\";')\n",
" df = pd.DataFrame(cur.fetchall()) #convert to pandas df\n",
" \n",
" print(\"Identified\", len(df[0]) ,\"OMOP Concepts:\", list(df[0]))\n",
" \n",
" return df\n",
" \n",
"def omop_concepts_to_atlas_json(df):\n",
" json = []\n",
" for i, row in df.iterrows():\n",
" #template for atlas api\n",
" out = { \n",
" \"concept\": {\n",
" 'CONCEPT_ID': row[0],\n",
" 'CONCEPT_NAME': row[1],\n",
" 'STANDARD_CONCEPT': 'S',\n",
" 'STANDARD_CONCEPT_CAPTION': 'Standard',\n",
" 'INVALID_REASON': 'V',\n",
" 'INVALID_REASON_CAPTION': 'Valid',\n",
" 'CONCEPT_CODE': row[6],\n",
" 'DOMAIN_ID': row[2],\n",
" 'VOCABULARY_ID': row[3],\n",
" 'CONCEPT_CLASS_ID': row[4],\n",
" 'VALID_START_DATE': int(row[7]),\n",
" 'VALID_END_DATE': int(row[8])\n",
" },\n",
" 'isExcluded': False,\n",
" 'includeDescendants': False,\n",
" 'includeMapped': False\n",
" }\n",
" json.append(out)\n",
" return json \n",
"\n",
"conn = sqlite3.connect(\"codes/omop_54.sqlite\")\n",
"cur = conn.cursor()\n",
"\n",
"vocab_id=\"SNOMED\" #SNOMED, ATC, ICD10CM, ICD9CM, Read\n",
"csv_output = \"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/ANGER.csv\"\n",
"\n",
"#Load CSV Output File\n",
"df_in = pd.read_csv(csv_output)\n",
"print(len(df_in))\n",
"\n",
"# df = get_omop_concepts(cur, [\"119768002\", \"5905001\"], \"SNOMED\")\n",
"df = get_omop_concepts(cur, list(df_in[\"code\"]), vocab_id)\n",
"json = omop_concepts_to_atlas_json(df)\n",
"# display(json)\n",
"\n",
"conn.close()"
]
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
164
,
"execution_count":
null
,
"id": "
b3166cf0-e4a5-43e0-aeac-78827427422e
",
"id": "
ea759907-c085-472a-82e2-07b6b19e2c8f
",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
"source": [
"#ATLAS GET CONCEPT SET\n",
"import requests\n",
"\n",
"def request_get(url):\n",
" try:\n",
" # Sending the GET request\n",
" response = requests.get(url)\n",
"\n",
" # Check if the response status code is 200 (OK)\n",
" if response.status_code == 200:\n",
" print(\"Response data:\")\n",
" # print(response.json()) # Assuming the response is in JSON format\n",
" return response.json()\n",
" else:\n",
" print(f\"Failed to fetch data. HTTP Status Code: {response.status_code}\")\n",
" print(\"Response content:\")\n",
" print(response.text)\n",
" return None\n",
"\n",
" except requests.exceptions.RequestException as e:\n",
" print(f\"An error occurred: {e}\")\n",
"\n",
"\n",
"#GET SET INFO\n",
"set_id = \"1885449\"\n",
"url = f\"https://atlas-demo.ohdsi.org/WebAPI/conceptset/{set_id}\"\n",
"request_get(url)"
]
},
{
{
"data": {
"cell_type": "code",
"text/plain": [
"execution_count": null,
"BNF Code object\n",
"id": "5a70e636-6051-4930-bf1b-30d093fd0552",
"SNOMED Code object\n",
"metadata": {},
"dtype: object"
"outputs": [],
"source": [
"#GET SET ITEMS (Concepts)\n",
"set_id = \"1885449\"\n",
"url = f\"https://atlas-demo.ohdsi.org/WebAPI/conceptset/{set_id}/expression/ATLASPROD\"\n",
"response = request_get(url)\n",
"display(response)"
]
]
},
},
"execution_count": 164,
{
"cell_type": "code",
"execution_count": null,
"id": "96bfcd9c-27e8-4be4-a680-7553d908790e",
"metadata": {},
"metadata": {},
"output_type": "execute_result"
"outputs": [],
}
],
"source": [
"source": [
"
.astype(str).dtypes
"
"
#ATLAS CREATE CONCEPT SET\n
"
]
]
}
}
],
],
...
...
%% Cell type:code id:8c8f4cdf-04a5-4762-895e-6555781a1f05 tags:
%% Cell type:code id:8c8f4cdf-04a5-4762-895e-6555781a1f05 tags:
```
python
```
python
import
pandas
as
pd
import
pandas
as
pd
import
numpy
as
np
import
numpy
as
np
import
json
import
json
```
```
%% Cell type:markdown id:c5786d78-7dc2-4f02-ad21-cee95e473823 tags:
%% Cell type:markdown id:c5786d78-7dc2-4f02-ad21-cee95e473823 tags:
### Ho generate JSON
### Ho generate JSON
%% Cell type:code id:0292dc90-e31a-4724-8536-d0b55533aaef tags:
%% Cell type:code id:0292dc90-e31a-4724-8536-d0b55533aaef tags:
```
python
```
python
#List v4 to json
#List v4 to json
df
=
pd
.
read_excel
(
"
PHEN_code_lists_sources_V4.xlsx
"
,
sheet_name
=
"
ho
"
,
dtype
=
str
)
df
=
pd
.
read_excel
(
"
PHEN_code_lists_sources_V4.xlsx
"
,
sheet_name
=
"
ho
"
,
dtype
=
str
)
# df = df.sort_values(by="mapped_condition")
# df = df.sort_values(by="mapped_condition")
def
json_file_template
(
file
,
cons
,
types
,
metadata
):
def
json_file_template
(
file
,
cons
,
types
,
metadata
):
concepts
=
""
concepts
=
""
for
concept
in
cons
:
for
concept
in
cons
:
concepts
+=
f
'"
{
concept
}
"
,
'
concepts
+=
f
'"
{
concept
}
"
,
'
concepts
=
concepts
[:
-
2
]
#remove last ,
concepts
=
concepts
[:
-
2
]
#remove last ,
type_str
=
""
type_str
=
""
for
k
,
v
in
types
.
items
():
for
k
,
v
in
types
.
items
():
type_str
+=
f
'"
{
k
}
"
:
"
{
v
}
"
,
'
type_str
+=
f
'"
{
k
}
"
:
"
{
v
}
"
,
'
type_str
=
type_str
[:
-
2
]
type_str
=
type_str
[:
-
2
]
meta_str
=
'"
metadata
"
:[
'
meta_str
=
'"
metadata
"
:[
'
for
v
in
metadata
:
for
v
in
metadata
:
meta_str
+=
f
'"
{
v
}
"
,
'
meta_str
+=
f
'"
{
v
}
"
,
'
meta_str
=
meta_str
[:
-
2
]
meta_str
=
meta_str
[:
-
2
]
meta_str
=
meta_str
+
"
]
"
meta_str
=
meta_str
+
"
]
"
return
'''
return
'''
{
{
\"
file
\"
:
\"
'''
+
file
+
'''"
,
\"
file
\"
:
\"
'''
+
file
+
'''"
,
\"
columns
\"
:{
\"
columns
\"
:{
'''
+
type_str
+
'''
,
'''
+
type_str
+
'''
,
'''
+
meta_str
+
'''
'''
+
meta_str
+
'''
},
},
\"
meldb_phenotypes
\"
:[
'''
+
concepts
+
'''
]
\"
meldb_phenotypes
\"
:[
'''
+
concepts
+
'''
]
},
'''
},
'''
out
=
'"
files
"
:[
'
out
=
'"
files
"
:[
'
folder
=
"
codes/GitHub_TG_repository/
"
folder
=
"
codes/GitHub_TG_repository/
"
for
file
,
grp
in
df
.
groupby
(
"
mapped_condition
"
):
for
file
,
grp
in
df
.
groupby
(
"
mapped_condition
"
):
file
=
file
.
replace
(
"
%20
"
,
"
"
)
file
=
file
.
replace
(
"
%20
"
,
"
"
)
for
ext
in
[
"
_CPRD_GOLD.csv
"
,
"
_CPRD_AURUM.csv
"
,
"
_IMRD.csv
"
]:
for
ext
in
[
"
_CPRD_GOLD.csv
"
,
"
_CPRD_AURUM.csv
"
,
"
_IMRD.csv
"
]:
path
=
file
+
"
/
"
+
file
+
ext
path
=
file
+
"
/
"
+
file
+
ext
if
os
.
path
.
isfile
(
folder
+
path
):
if
os
.
path
.
isfile
(
folder
+
path
):
out
+=
json_file_template
(
path
,
grp
[
"
meldb_condition
"
],
out
+=
json_file_template
(
path
,
grp
[
"
meldb_condition
"
],
types
=
{
types
=
{
"
read2_code
"
:
"
READ_CODE
"
,
"
read2_code
"
:
"
READ_CODE
"
,
"
snomed_code
"
:
"
SNOMED_CT_CODE
"
,
"
snomed_code
"
:
"
SNOMED_CT_CODE
"
,
# "med_code":"MEDICAL_CODE_ID",
# "med_code":"MEDICAL_CODE_ID",
},
},
metadata
=
[
"
DESCRIPTION
"
]
metadata
=
[
"
DESCRIPTION
"
]
)
)
else
:
else
:
print
(
"
NOT FILE
"
,
folder
+
path
)
print
(
"
NOT FILE
"
,
folder
+
path
)
for
ext
in
[
"
_ICD10.csv
"
]:
for
ext
in
[
"
_ICD10.csv
"
]:
path
=
file
+
"
/
"
+
file
+
ext
path
=
file
+
"
/
"
+
file
+
ext
if
os
.
path
.
isfile
(
folder
+
path
):
if
os
.
path
.
isfile
(
folder
+
path
):
out
+=
json_file_template
(
path
,
grp
[
"
meldb_condition
"
],
out
+=
json_file_template
(
path
,
grp
[
"
meldb_condition
"
],
types
=
{
types
=
{
"
icd10_code
"
:
"
READ_CODE
"
,
"
icd10_code
"
:
"
READ_CODE
"
,
"
snomed_code
"
:
"
SNOMED_CT_CODE
"
,
"
snomed_code
"
:
"
SNOMED_CT_CODE
"
,
# "icd10_code":"MEDICAL_CODE_ID",
# "icd10_code":"MEDICAL_CODE_ID",
},
},
metadata
=
[
"
DESCRIPTION
"
]
metadata
=
[
"
DESCRIPTION
"
]
)
)
else
:
else
:
print
(
"
NOT FILE
"
,
folder
+
path
)
print
(
"
NOT FILE
"
,
folder
+
path
)
# out+= json_file_template(file+"/"+file+"_CPRD_AURUM.csv", grp["meldb_condition"])
# out+= json_file_template(file+"/"+file+"_CPRD_AURUM.csv", grp["meldb_condition"])
# out+= json_file_template(file+"/"+file+"_ICD10.csv", grp["meldb_condition"])
# out+= json_file_template(file+"/"+file+"_ICD10.csv", grp["meldb_condition"])
# out+= json_file_template(file+"/"+file+"_IMRD.csv", grp["meldb_condition"])
# out+= json_file_template(file+"/"+file+"_IMRD.csv", grp["meldb_condition"])
# out += f' "{file}/{file}_CPRD_GOLD.csv":[{conds}],\n'
# out += f' "{file}/{file}_CPRD_GOLD.csv":[{conds}],\n'
# out += f' "{file}/{file}_CPRD_AURUM.csv":[{conds}],\n'
# out += f' "{file}/{file}_CPRD_AURUM.csv":[{conds}],\n'
# out += f' "{file}/{file}_ICD10.csv":[{conds}],\n'
# out += f' "{file}/{file}_ICD10.csv":[{conds}],\n'
# out += f' "{file}/{file}_IMRD.csv":[{conds}],\n'
# out += f' "{file}/{file}_IMRD.csv":[{conds}],\n'
out
=
out
[:
-
1
]
#remove last ,
out
=
out
[:
-
1
]
#remove last ,
out
+=
"
\n
]
"
out
+=
"
\n
]
"
out
=
out
.
replace
(
"
%20
"
,
"
"
)
out
=
out
.
replace
(
"
%20
"
,
"
"
)
print
(
out
)
print
(
out
)
```
```
%% Cell type:code id:f155b635-b459-4aff-81b2-e065fc223858 tags:
%% Cell type:code id:f155b635-b459-4aff-81b2-e065fc223858 tags:
```
python
```
python
``
`
``
`
%%
Output
0
False
dtype
:
bool
%%
Cell
type
:
code
id
:
d040eda5
-
4028
-
4047
-
834
c
-
7315e307
e415
tags
:
%%
Cell
type
:
code
id
:
d040eda5
-
4028
-
4047
-
834
c
-
7315e307
e415
tags
:
```
python
```
python
df = pd.read_parquet("maps/processed/icd10_code.parquet")
df = pd.read_parquet("maps/processed/icd10_code.parquet")
df
df
```
```
%% Output
icd10_code icd10_alt_code \
0 A00 A00
1 A00.0 A000
2 A00.1 A001
3 A00.9 A009
4 A01 A01
... ... ...
17929 U84.3 U843
17930 U84.7 U847
17931 U84.8 U848
17932 U84.9 U849
17933 U85 U85X
description
0 Cholera
1 Cholera due to Vibrio cholerae 01, biovar chol...
2 Cholera due to Vibrio cholerae 01, biovar eltor
3 Cholera, unspecified
4 Typhoid and paratyphoid fevers
... ...
17929 Resistance to tuberculostatic drug(s)
17930 Resistance to multiple antimicrobial drugs
17931 Resistance to other specified antimicrobial drug
17932 Resistance to unspecified antimicrobial drugs
17933 Resistance to antineoplastic drugs
[17934 rows x 3 columns]
%% Cell type:code id:e0228ac9-8852-4818-b7f0-98429ca5229c tags:
%% Cell type:code id:e0228ac9-8852-4818-b7f0-98429ca5229c tags:
```
python
```
python
code = ["A00.0", "
*
00.0"]
code = ["A00.0", "
*
00.0"]
code = pd.Series(code)
code = pd.Series(code)
print(code.isin(df["icd10_code"]))
print(code.isin(df["icd10_code"]))
print(code.isin(df["icd10_alt_code"]))
print(code.isin(df["icd10_alt_code"]))
# print( )
# print( )
~(
~(
~code.isin(df["icd10_code"])
~code.isin(df["icd10_code"])
&
&
~code.isin(df["icd10_alt_code"])
~code.isin(df["icd10_alt_code"])
)
)
```
```
%% Output
0 True
1 False
dtype: bool
0 False
1 False
dtype: bool
0 True
1 False
dtype: bool
%% Cell type:markdown id:18efcacd-45f0-4341-86cc-d8e2e584350c tags:
%% Cell type:markdown id:18efcacd-45f0-4341-86cc-d8e2e584350c tags:
### Analyse the JSON file
### Analyse the JSON file
%% Cell type:code id:85dc197b-451e-4fa9-a53b-e6770c132123 tags:
```
python
import json
import os
path_json = "../concepts/PHEN_assign_v3.json"
#Load JSON Concept Definitions
mapping = json.load(open(path_json,'rb'))
summary_config = mapping
[
"concept_sets"
][
"concept_set"
]
summary_df = pd.DataFrame(summary_config) #change to dataframe
summary_df = summary_df.join(pd.json_normalize(summary_df["metadata"])) #metadata to columns
summary_df = summary_df.drop(columns=["metadata"])
summary_df = summary_df.rename(columns={"concept_set_name":"CONCEPT_SET"})
summary_df = summary_df.drop_duplicates() #remove duplicates
summary_df
```
%% Cell type:code id:4c9b6b3f-08aa-4f61-b9b2-44a24b5d00a0 tags:
%% Cell type:code id:4c9b6b3f-08aa-4f61-b9b2-44a24b5d00a0 tags:
```
python
```
python
import json
import json
import os
import os
path_json = "PHEN_assign_v3.json"
path_json = "PHEN_assign_v3.json"
path_excel = "PHEN_summary_working.xlsx"
path_excel = "PHEN_summary_working.xlsx"
path_codes = "codes/"
path_codes = "codes/"
#Get all Files in JSON
#Get all Files in JSON
def get_json_files(path_json):
def get_json_files(path_json):
folders = json.load(open(path_json,'rb'))
folders = json.load(open(path_json,'rb'))
out = []
out = []
for folder in folders:
for folder in folders:
if "files" in folder:
if "files" in folder:
for file in folder["files"]:
for file in folder["files"]:
file_path = folder["folder"]+"/"+file["file"]
file_path = folder["folder"]+"/"+file["file"]
if "meldb_phenotypes" in file:
if "meldb_phenotypes" in file:
for concept in file["meldb_phenotypes"]:
for concept in file["meldb_phenotypes"]:
out.append({"json_concept":concept, "filepath":file_path, "json_code_types":list(file["columns"].keys())})
out.append({"json_concept":concept, "filepath":file_path, "json_code_types":list(file["columns"].keys())})
elif "meldb_phenotypes_categories" in file:
elif "meldb_phenotypes_categories" in file:
for code, concept in file["meldb_phenotypes_categories"].items():
for code, concept in file["meldb_phenotypes_categories"].items():
out.append({"json_concept":concept[0], "filepath":file_path, "json_code_types":list(file["columns"].keys())})
out.append({"json_concept":concept[0], "filepath":file_path, "json_code_types":list(file["columns"].keys())})
else:
else:
out.append({"json_concept":None, "filepath":file_path})
out.append({"json_concept":None, "filepath":file_path})
out = pd.DataFrame(out)
out = pd.DataFrame(out)
out["filepath"] = out["filepath"].astype(str)
out["filepath"] = out["filepath"].astype(str)
return out
return out
out = get_json_files(path_json)
out = get_json_files(path_json)
#Get all Files Excel Summary
#Get all Files Excel Summary
def get_excel_files(path_excel):
def get_excel_files(path_excel):
path_excel = "PHEN_summary_working.xlsx"
path_excel = "PHEN_summary_working.xlsx"
out2 = pd.read_excel(path_excel)
out2 = pd.read_excel(path_excel)
out2 = out2[["CONCEPT NAME ", "CODING LIST", "AGREED", "FUNCTION"]].loc[1:] #select relevant columns
out2 = out2[["CONCEPT NAME ", "CODING LIST", "AGREED", "FUNCTION"]].loc[1:] #select relevant columns
#Filter Concepts in use
#Filter Concepts in use
out2 = out2[out2["AGREED"] == "USE"] #remove deprecated concepts
out2 = out2[out2["AGREED"] == "USE"] #remove deprecated concepts
out2 = out2[out2["FUNCTION"] == "QUERY BY CODING LIST"] #remove deprecated concepts
out2 = out2[out2["FUNCTION"] == "QUERY BY CODING LIST"] #remove deprecated concepts
out2 = out2.drop(['AGREED', 'FUNCTION'], axis=1)
out2 = out2.drop(['AGREED', 'FUNCTION'], axis=1)
#Get filepaths
#Get filepaths
out2["CODING LIST"] = out2["CODING LIST"].str.split(",") #split by ,
out2["CODING LIST"] = out2["CODING LIST"].str.split(",") #split by ,
out2 = out2.explode("CODING LIST") #one row per file
out2 = out2.explode("CODING LIST") #one row per file
out2["CODING LIST"] = out2["CODING LIST"].str.strip()
out2["CODING LIST"] = out2["CODING LIST"].str.strip()
out2["CODING LIST"] = out2["CODING LIST"].str.replace("https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/", "")
out2["CODING LIST"] = out2["CODING LIST"].str.replace("https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/", "")
out2["CODING LIST"] = out2["CODING LIST"].str.replace("%20", " ")
out2["CODING LIST"] = out2["CODING LIST"].str.replace("%20", " ")
out2 = out2.rename(columns={"CONCEPT NAME ":"excel_concept", "CODING LIST":"filepath"})
out2 = out2.rename(columns={"CONCEPT NAME ":"excel_concept", "CODING LIST":"filepath"})
return out2
return out2
out2 = get_excel_files(path_excel)
out2 = get_excel_files(path_excel)
#Get all Files in /codes
#Get all Files in /codes
def get_code_files(path_codes):
def get_code_files(path_codes):
all_files = []
all_files = []
for root, dirs, files in os.walk(path_codes, topdown=False):
for root, dirs, files in os.walk(path_codes, topdown=False):
for name in files:
for name in files:
if ".ipynb_checkpoint" not in root: #exclude notebook checkpoints
if ".ipynb_checkpoint" not in root: #exclude notebook checkpoints
if name.endswith(".csv") or name.endswith(".xlsx") or name.endswith(".dta"): #exclude non-data files
if name.endswith(".csv") or name.endswith(".xlsx") or name.endswith(".dta"): #exclude non-data files
all_files.append(os.path.join(root, name))
all_files.append(os.path.join(root, name))
all_files = pd.DataFrame(all_files)
all_files = pd.DataFrame(all_files)
all_files = all_files.rename(columns={0:"filepath"})
all_files = all_files.rename(columns={0:"filepath"})
all_files["filepath"] = all_files["filepath"].astype(str)
all_files["filepath"] = all_files["filepath"].astype(str)
return all_files
return all_files
all_files = get_code_files(path_codes)
all_files = get_code_files(path_codes)
print("ALL FILES", len(all_files), len(all_files["filepath"].unique()))
print("ALL FILES", len(all_files), len(all_files["filepath"].unique()))
print("JSON CONCEPTS", len(out), len(out["filepath"].unique()))
print("JSON CONCEPTS", len(out), len(out["filepath"].unique()))
print("EXCEL CONCEPTS", len(out2), len(out2["filepath"].unique()))
print("EXCEL CONCEPTS", len(out2), len(out2["filepath"].unique()))
outs = pd.merge(all_files, out, how="outer", on="filepath")
outs = pd.merge(all_files, out, how="outer", on="filepath")
outs = pd.merge(outs, out2, how="outer", on="filepath")
outs = pd.merge(outs, out2, how="outer", on="filepath")
print(len(outs), len(outs["filepath"].unique()))
print(len(outs), len(outs["filepath"].unique()))
outs.to_csv("output/MELD_file_to_concept.csv", index=False)
outs.to_csv("output/MELD_file_to_concept.csv", index=False)
# display(outs[ outs["concept"].isna()])
# display(outs[ outs["concept"].isna()])
# display(out )
# display(out )
```
```
%% Output
ALL FILES 878 878
JSON CONCEPTS 436 397
EXCEL CONCEPTS 440 397
1755 878
/opt/conda/lib/python3.9/site-packages/openpyxl/worksheet/_reader.py:329: UserWarning: Data Validation extension is not supported and will be removed
warn(msg)
%% Cell type:code id:f8e70c33-c869-46f8-953e-f6b52992cfbb tags:
%% Cell type:code id:f8e70c33-c869-46f8-953e-f6b52992cfbb tags:
```
python
```
python
display("JSON MISSING", outs[outs["json_concept"].isna() & outs["excel_concept"].notna()])
display("JSON MISSING", outs[outs["json_concept"].isna() & outs["excel_concept"].notna()])
display("EXCEL MISSING", outs[outs["json_concept"].notna() & outs["excel_concept"].isna()])
display("EXCEL MISSING", outs[outs["json_concept"].notna() & outs["excel_concept"].isna()])
```
```
%% Output
%% Cell type:code id:9d84465f-f064-4df2-b0e4-2dfb217aea21 tags:
%% Cell type:code id:9d84465f-f064-4df2-b0e4-2dfb217aea21 tags:
```
python
```
python
f = open('concepts-output/MELD-report.md', 'a') as f:
f = open('concepts-output/MELD-report.md', 'a') as f:
f.write(
f.write(
"""
"""
# Report
# Report
-
One thing
-
One thing
-
Two thing
-
Two thing
-
Three thing
-
Three thing
""")
""")
```
```
%% Cell type:code id:7f7fc771-e406-42c7-8a09-16a20b5298f5 tags:
%% Cell type:code id:7f7fc771-e406-42c7-8a09-16a20b5298f5 tags:
```
python
```
python
total_length = 0
total_length = 0
for file in all_files["filepath"]:
for file in all_files["filepath"]:
if file.endswith(".csv"):
if file.endswith(".csv"):
df_file = pd.read_csv(file)
df_file = pd.read_csv(file)
total_length += len(df_file)
total_length += len(df_file)
elif file.endswith(".xlsx"):
elif file.endswith(".xlsx"):
df_file = pd.read_excel(file)
df_file = pd.read_excel(file)
total_length += len(df_file)
total_length += len(df_file)
elif file.endswith(".dta"):
elif file.endswith(".dta"):
df_file = pd.read_stata(file)
df_file = pd.read_stata(file)
total_length += len(df_file)
total_length += len(df_file)
total_length
total_length
```
```
%% Output
65307
%% Cell type:code id:08a9c565-28d6-46ee-9fa8-6fa0ee28a4d5 tags:
%% Cell type:code id:08a9c565-28d6-46ee-9fa8-6fa0ee28a4d5 tags:
```
python
```
python
# Turn source filepaths into GitLab links, then aggregate one row per concept
# with a comma-separated list of all links that contribute to it.
base_url = "https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/"
outs2 = outs.copy()
outs2["filepath"] = base_url + outs2["filepath"].str.replace(" ", "%20")
outs2 = outs2.groupby("concept")["filepath"].apply(', '.join).reset_index()
outs2 = outs2.sort_values(by=["concept"])
outs2
outs2.to_csv("output/MELD_GitLab_link_to_concept.csv", index=False)
```
```
%% Cell type:markdown id:357bb84c-90c2-4b5f-95c0-443191783a7f tags:
%% Cell type:markdown id:357bb84c-90c2-4b5f-95c0-443191783a7f tags:
### Analyse Output Files
### Analyse Output Files
%% Cell type:code id:7d3f9cb7-be86-4f1f-92f6-991094eb7bb7 tags:
%% Cell type:code id:7d3f9cb7-be86-4f1f-92f6-991094eb7bb7 tags:
```
python
```
python
# Sanity-check each generated output file for one pipeline version:
# how many codes lack a MELDB concept type, and which characters appear
# in the code column (a quick way to spot malformed codes).
version = "V2_2_2"
# med_no_translate is currently excluded from the checks.
suffixes = ["concepts_readv2", "snomed_no_translate", "icd10_no_translate", "atc_no_translate"]
output_files = [f"output/{version}_MELD_{suffix}.csv" for suffix in suffixes]
error_file = f"output/{version}_MELD_errors.csv"

for output_file in output_files:
    print("---"*3, output_file, "---"*3,)
    df = pd.read_csv(output_file)
    print("MELDB missing concepts ", len(df[df["CONCEPT TYPE"].isna()]))
    if df["code"].dtype == "object":
        # Explode each code into its character set and list the distinct chars.
        print("Chars present:", np.sort(df["code"].apply(set).explode().unique()))

print("---"*3, error_file, "---"*3,)
df = pd.read_csv(error_file)
df = df.drop_duplicates()
df["CODE_TYPE"].value_counts()
```
```
%% Output
--------- output/V2_2_2_MELD_concepts_readv2.csv ---------
MELDB missing concepts 0
Chars present: ['.' '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' 'A' 'B' 'C' 'D' 'E' 'F' 'G'
'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y'
'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q'
'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']
--------- output/V2_2_2_MELD_snomed_no_translate.csv ---------
MELDB missing concepts 0
--------- output/V2_2_2_MELD_icd10_no_translate.csv ---------
MELDB missing concepts 0
Chars present: ['0' '1' '2' '3' '4' '5' '6' '7' '8' '9' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H'
'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'T' 'W' 'X' 'Y' 'Z']
--------- output/V2_2_2_MELD_atc_no_translate.csv ---------
MELDB missing concepts 0
Chars present: ['0' '1' '2' '3' '6' 'A' 'F' 'N' 'X']
--------- output/V2_2_2_MELD_errors.csv ---------
CODE_TYPE
snomed_code 1261
read2_code 464
read3_code 80
icd10_code 1
Name: count, dtype: int64
%% Cell type:code id:08e0ecc1-9271-48c3-9c5b-094800072906 tags:
%% Cell type:code id:08e0ecc1-9271-48c3-9c5b-094800072906 tags:
```
python
```
python
def get_output_files(version):
    """Return ``(output_files, error_file)`` paths for a pipeline version.

    ``output_files`` lists the per-vocabulary output CSVs (the
    ``med_no_translate`` file is currently excluded); ``error_file`` is the
    version's error-log CSV.
    """
    suffixes = [
        "concepts_readv2",
        "snomed_no_translate",
        "icd10_no_translate",
        # "med_no_translate",  # excluded for now
        "atc_no_translate",
    ]
    output_files = [f"output/{version}_MELD_{suffix}.csv" for suffix in suffixes]
    error_file = f"output/{version}_MELD_errors.csv"
    return output_files, error_file
# Compare per-concept code counts between two pipeline versions and report
# concept sets that were removed, added, or changed size.
# version_1 = "V1_0_0"
version_1 = "V2_1_4"
version_2 = "V2_2_3"
output1, err1 = get_output_files(version_1)
output2, err2 = get_output_files(version_2)

def _concept_counts(path):
    """Number of codes per MELDB concept in one output CSV."""
    frame = pd.read_csv(path)
    return frame[["code", "MELDB_concept"]].groupby("MELDB_concept").count()

print("## Compare Concepts", version_1, "to", version_2)
for out1, out2 in zip(output1, output2):
    print(out1, out2)
    df1 = _concept_counts(out1)
    df2 = _concept_counts(out2)
    # Concept sets present in exactly one of the two versions.
    print("- Removed Concepts", list(set(df1.index) - set(df2.index)))
    print("- Added Concepts", list(set(df2.index) - set(df1.index)))
    # Concept sets whose code count changed (non-zero, non-NaN delta).
    diff = df2 - df1
    diff = diff[(~(diff["code"] == 0.0)) & diff["code"].notna()]
    s = "\n"
    for concept, row in diff.iterrows():
        s += "\t - {} {}\n".format(concept, row["code"])
    print("- Changed Concepts", s)
# for output_file in output_files:
# for output_file in output_files:
# print("---"*3,output_file,"---"*3,)
# print("---"*3,output_file,"---"*3,)
# df = pd.read_csv(output_file)
# df = pd.read_csv(output_file)
# # df["MELDB_concept"].loc[df["CONCEPT TYPE"].isna()]
# # df["MELDB_concept"].loc[df["CONCEPT TYPE"].isna()]
# print("MELDB missing concepts ", len(df[df["CONCEPT TYPE"].isna()]))
# print("MELDB missing concepts ", len(df[df["CONCEPT TYPE"].isna()]))
# if df["code"].dtype == "object":
# if df["code"].dtype == "object":
# print("Chars present:", np.sort(df["code"].apply(lambda x : set(x)).explode().unique()))
# print("Chars present:", np.sort(df["code"].apply(lambda x : set(x)).explode().unique()))
```
```
%% Output
%% Cell type:code id:cc60c137-5a85-4155-af6b-6796f8c05980 tags:
```
python
import glob
import os
import pandas as pd

# Mark, for each concept set in the PHEN summary, which vocabularies have a
# codelist file on disk (one "YES" column per vocabulary).
df = pd.read_csv("/home/jjd1c23/ssd/meldb/jjd1c23/concepts/PHEN_summary_working.csv")
df = df.set_index("#")
for vocab in ["atc", "icd10", "readv2", "snomed"]:
    df[vocab.upper()] = ""
    for file in glob.glob(f"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/{vocab}/*.csv"):
        # Filename (without .csv) is the concept-set name.
        concept_set = os.path.splitext(os.path.basename(file))[0]
        row_index = df[df["CONCEPT NAME "] == concept_set].index[0]
        df.loc[row_index, vocab.upper()] = "YES"
df = df.drop(columns=["READv2_CODE", "ICD10_CODE"])
df.to_csv("/home/jjd1c23/ssd/meldb/jjd1c23/concepts/PHEN_summary_working_labelled.csv")
- COELIAC_DISEASE -1.0
```
- CORONARY_HEART_DISEASE -8.0
- DEAFNESS -33.0
- DEMENTIA_ALZHEIMER -2.0
- DEPRESSION -5.0
- DIABETES_T1 -1.0
- DIABETES_T2 -1.0
- DIALYSIS -14.0
- DIVERTICULAR_DISEASE -11.0
- DRUG_ALCOHOL_MISUSE -3.0
- EATING_DISORDERS -2.0
- EPILEPSY -1.0
- FATIGUE -27.0
- HEADACHE -48.0
- HF -3.0
- INCONTINENCE -21.0
- LEARNING_DISABILITY -3.0
- MSK_PAIN -36.0
- MULTIPLE_SCLEROSIS -1.0
- PALLIATIVE_CARE -8.0
- PLASMACELL -1.0
- PTSD -1.0
- SCHIZOPHRENIA -1.0
- SELF_HARM -37.0
- SLEEP_PROBLEMS -74.0
- STRESS -31.0
- SYSTEMIC_LUPUS_ERYTHEMATOSUS -2.0
output/V2_1_4_MELD_snomed_no_translate.csv output/V2_2_3_MELD_snomed_no_translate.csv
- Removed Concepts ['THYROID_DISEASE', 'SCHIZOPHRENIA_BIPOLAR_DISORDER', 'PSIORIASIS_ECZEMA', 'HAEMATOLOGICAL_CANCERS', 'INFLAMM_ARTHROPATHIES', 'ALL_CANCER', 'STROKE_TIA', 'DIABETES', 'PMR_AND_GCD', 'LONG_TERM_MS_PROBLEMS', 'ALL_CKD', 'INFLAMM_ARTHROPATHIES_CONNECTIVE_TISSUE_DIS', 'RENAL_TRANSPLANT_DIALYSIS']
- Added Concepts []
- Changed Concepts
- ANAEMIA -2.0
- ANEURYSM -3.0
- ANXIETY -7.0
- ARRHYTHMIA -25.0
- ASTHMA -34.0
- ATOPIC_ECZEMA -6.0
- AUTISM_AND_ADHD -2.0
- BIPOLAR_DISORDER -3.0
- BLINDNESS_AND_LOW_VISION -4.0
- BREAST_CANCER -2.0
- BRONCHIECSTASIS -1.0
- CHRONIC_BACK_PAIN -1.0
- CHRONIC_FATIGUE_SYNDROME -3.0
- CHRONIC_LIVER_DISEASE -14.0
- CHRONIC_PAIN -2.0
- CKD_STAGE3_5 -3.0
- COELIAC_DISEASE -6.0
- COLON_CANCER -6.0
- CONGENITAL_DIS_CHROMOSOMAL_ABNORMALITIES -1.0
- COPD -31.0
- CORONARY_HEART_DISEASE -21.0
- CYSTIC_FIBROSIS -24.0
- DEAFNESS -15.0
- DEMENTIA_ALZHEIMER -111.0
- DEPRESSION -34.0
- DIABETES_T2 -2.0
- DIABETIC_RETINOPATHY -13.0
- DIALYSIS -1.0
- DIVERTICULAR_DISEASE -4.0
- DRUG_ALCOHOL_MISUSE -310.0
- EATING_DISORDERS -4.0
- ENDOMETRIOSIS -1.0
- EPILEPSY -11.0
- GLAUCOMA -3.0
- GOUT -4.0
- HEART_VALVE_DISORDERS -6.0
- HF -4.0
- HIVAIDS -18.0
- HYPERTENSION -11.0
- HYPERTHYROIDISM -1.0
- HYPOTHYROIDISM -8.0
- IBD -2.0
- ILD -2.0
- LEARNING_DISABILITY -40.0
- LEUKAEMIA -1.0
- LYMPHOMA -2.0
- MENIERES_DISEASE -1.0
- METASTATIC_CANCER -3.0
- MOBILITY_PROBLEMS -45.0
- MULTIPLE_SCLEROSIS -13.0
- OBESITY -63.0
- OSTEOARTHRITIS -3.0
- OSTEOPOROSIS -4.0
- PARALYSIS -3.0
- PARKINSONS -2.0
- PLASMACELL -1.0
- PROSTATE_CANCER -2.0
- PROSTATE_DISORDERS -2.0
- PSORIASIS -3.0
- PTSD -38.0
- RENAL_TRANSPLANT -1.0
- RHEUMATOID_ARTHRITIS -8.0
- SCHIZOPHRENIA -85.0
- SKIN_CANCER -4.0
- STROKE -4.0
- SYSTEMIC_LUPUS_ERYTHEMATOSUS -1.0
- TIA -1.0
- VIRAL_HEPATITIS -9.0
- VTD -5.0
output/V2_1_4_MELD_icd10_no_translate.csv output/V2_2_3_MELD_icd10_no_translate.csv
- Removed Concepts ['THYROID_DISEASE', 'SCHIZOPHRENIA_BIPOLAR_DISORDER', 'PSIORIASIS_ECZEMA', 'HAEMATOLOGICAL_CANCERS', 'INFLAMM_ARTHROPATHIES', 'ALL_CANCER', 'STROKE_TIA', 'DIABETES', 'PMR_AND_GCD', 'LONG_TERM_MS_PROBLEMS', 'ALL_CKD', 'INFLAMM_ARTHROPATHIES_CONNECTIVE_TISSUE_DIS']
- Added Concepts []
- Changed Concepts
- CVD_EVENTS -1.0
output/V2_1_4_MELD_atc_no_translate.csv output/V2_2_3_MELD_atc_no_translate.csv
- Removed Concepts []
- Added Concepts []
- Changed Concepts
%% Cell type:markdown id:e5c4291f-847b-4c82-976e-bd5b3a7b6bcc tags:
%% Cell type:markdown id:e5c4291f-847b-4c82-976e-bd5b3a7b6bcc tags:
### Mappings
### Mappings
%% Cell type:code id:08e34750-413c-469e-bcb8-e71bb188ff42 tags:
%% Cell type:code id:08e34750-413c-469e-bcb8-e71bb188ff42 tags:
```
python
```
python
# NHS Read Browser: export Read v2/v3 code lists and their cross-maps
# (ATC, ICD-10, OPCS-4) from the distributed DBF tables to parquet.
import simpledbf
import pandas as pd

V2_DIR = 'maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2'
V3_DIR = 'maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V3'

def _drop_dash(frame, column):
    # Codes containing '-' denote ranges/placeholders in these tables; drop them.
    return frame[~frame[column].str.match("^.*-.*$")]

# r2 only: union of ancestor and descendant Read v2 codes.
df = simpledbf.Dbf5(V2_DIR + '/ANCESTOR.DBF').to_dataframe()
df = pd.DataFrame(pd.concat([df['READCODE'], df['DESCENDANT']]).drop_duplicates())
df = df.rename(columns={0: "read2_code"})
df.to_parquet("maps/processed/read2_code.parquet", index=False)

# r2 -> atc
df = simpledbf.Dbf5(V2_DIR + '/ATC.DBF').to_dataframe()
df = df[["READCODE", "ATC"]].rename(columns={"READCODE": "read2_code", "ATC": "atc_code"})
df.to_parquet("maps/processed/read2_code_to_atc_code.parquet", index=False)

# r2 -> icd10
df = simpledbf.Dbf5(V2_DIR + '/ICD10.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]].rename(columns={"READ_CODE": "read2_code", "TARG_CODE": "icd10_code"})
df = _drop_dash(df, "icd10_code")
df = _drop_dash(df, "read2_code")
df.to_parquet("maps/processed/read2_code_to_icd10_code.parquet", index=False)

# r2 -> opcs4
df = simpledbf.Dbf5(V2_DIR + '/OPCS4V3.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]].rename(columns={"READ_CODE": "read2_code", "TARG_CODE": "opcs4_code"})
df = _drop_dash(df, "opcs4_code")
df = _drop_dash(df, "read2_code")
df.to_parquet("maps/processed/read2_code_to_opcs4_code.parquet", index=False)

# r3 only: union of ancestor and descendant Read v3 (CTV3) codes.
df = simpledbf.Dbf5(V3_DIR + '/ANCESTOR.DBF').to_dataframe()
df = pd.DataFrame(pd.concat([df['READCODE'], df['DESCENDANT']]).drop_duplicates())
df = df.rename(columns={0: "read3_code"})
df.to_parquet("maps/processed/read3_code.parquet", index=False)

# r3 -> icd10
df = simpledbf.Dbf5(V3_DIR + '/ICD10.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]].rename(columns={"READ_CODE": "read3_code", "TARG_CODE": "icd10_code"})
df = _drop_dash(df, "icd10_code")
df = _drop_dash(df, "read3_code")
df.to_parquet("maps/processed/read3_code_to_icd10_code.parquet", index=False)

# r3 -> icd9 (not processed)
# dbf = simpledbf.Dbf5(V3_DIR + '/ICD9V3.DBF')

# r3 -> opcs4
df = simpledbf.Dbf5(V3_DIR + '/OPCS4V3.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]].rename(columns={"READ_CODE": "read3_code", "TARG_CODE": "opcs4_code"})
df = _drop_dash(df, "opcs4_code")
df = _drop_dash(df, "read3_code")
df.to_parquet("maps/processed/read3_code_to_opcs4_code.parquet", index=False)
```
```
%% Cell type:code id:5fe95638-1f25-45f3-803c-2fff74a2a4fd tags:
%% Cell type:code id:5fe95638-1f25-45f3-803c-2fff74a2a4fd tags:
```
python
```
python
# NHS Data Migrations: export SNOMED code list and Read2/Read3/SNOMED
# cross-maps from the Clinically Assured migration tables to parquet.
ASSURED = 'maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/'

def _drop_underscore(frame, column):
    # Rows whose code contains '_' are placeholders in these tables; drop them.
    return frame[~frame[column].str.match("^.*_.*$")]

# r2 only (not processed)
# df = pd.read_csv(ASSURED + 'rctcremap_uk_20200401000001.txt', sep='\t')
# r3 only (not processed)
# df = pd.read_csv(ASSURED + 'ctv3cremap_uk_20200401000001.txt', sep='\t')

# snomed only: distinct SNOMED concept ids, stored as strings.
df = pd.read_csv(ASSURED + 'sctcremap_uk_20200401000001.txt', sep='\t')
df = df[["SCT_CONCEPTID"]].rename(columns={"SCT_CONCEPTID": "snomed_code"})
df = df.drop_duplicates()
df = df.astype(str)
df.to_parquet("maps/processed/snomed_code.parquet", index=False)

# r2 -> r3
df = pd.read_csv(ASSURED + 'rctctv3map_uk_20200401000001.txt', sep='\t')
df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]].rename(
    columns={"V2_CONCEPTID": "read2_code", "CTV3_CONCEPTID": "read3_code"})
df.to_parquet("maps/processed/read2_code_to_read3_code.parquet", index=False)

# r3 -> r2
df = pd.read_csv(ASSURED + 'ctv3rctmap_uk_20200401000002.txt', sep='\t')
df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]].rename(
    columns={"CTV3_CONCEPTID": "read3_code", "V2_CONCEPTID": "read2_code"})
df = df.drop_duplicates()
df = _drop_underscore(df, "read2_code")
df.to_parquet("maps/processed/read3_code_to_read2_code.parquet", index=False)

# r2 -> snomed
df = pd.read_csv(ASSURED + 'rcsctmap2_uk_20200401000001.txt', sep='\t', dtype=str)
df = df[["ReadCode", "ConceptId"]].rename(
    columns={"ReadCode": "read2_code", "ConceptId": "snomed_code"})
df.to_parquet("maps/processed/read2_code_to_snomed_code.parquet", index=False)

# r3 -> snomed
df = pd.read_csv(ASSURED + 'ctv3sctmap2_uk_20200401000001.txt', sep='\t')
df = df[["CTV3_TERMID", "SCT_CONCEPTID"]].rename(
    columns={"CTV3_TERMID": "read3_code", "SCT_CONCEPTID": "snomed_code"})
df["snomed_code"] = df["snomed_code"].astype(str)
df = _drop_underscore(df, "snomed_code")
df.to_parquet("maps/processed/read3_code_to_snomed_code.parquet", index=False)
```
```
%% Cell type:code id:267fa1cc-5159-48c4-9eee-19af5039d627 tags:
%% Cell type:code id:267fa1cc-5159-48c4-9eee-19af5039d627 tags:
```
python
```
python
# OPCS-4.10 code list: tab-separated code/title pairs with no header row.
df = pd.read_csv(
    "maps/OPCS410 Data files txt/OPCS410 CodesAndTitles Nov 2022 V1.0.txt",
    sep='\t', dtype=str, header=None,
).rename(columns={0: "opcs4_code", 1: "description"})
df.to_parquet("maps/processed/opcs4_code.parquet", index=False)
```
```
%% Cell type:code id:01d046fd-69af-44f3-acad-5d0edef3f745 tags:
%% Cell type:code id:01d046fd-69af-44f3-acad-5d0edef3f745 tags:
```
python
```
python
# ICD-10 Edition 5: extract code, alternate code and description from the XML.
df = pd.read_xml("maps/ICD10_Edition5_XML_20160401/Content/ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml",)
df = df[["CODE", "ALT_CODE", "DESCRIPTION"]].rename(columns={
    "CODE": "icd10_code",
    "ALT_CODE": "icd10_alt_code",
    "DESCRIPTION": "description",
})
df.to_parquet("maps/processed/icd10_code.parquet", index=False)
```
```
%% Cell type:code id:36630e24-f56c-48e1-8ecf-4ccd2b41eaea tags:
%% Cell type:code id:36630e24-f56c-48e1-8ecf-4ccd2b41eaea tags:
```
python
```
python
# Quick demo: translate the first few source codes via a processed map file.
code1 = "read2_code"
code2 = "icd10_code"
df_map = pd.read_parquet(f"maps/processed/{code1}_to_{code2}.parquet")
codes = df_map[code1].iloc[:5]
pd.merge(codes, df_map, how='left')[code2]
```
```
%% Cell type:code id:9787adeb-8507-488b-9a91-b8df3fbbe21e tags:
%% Cell type:code id:9787adeb-8507-488b-9a91-b8df3fbbe21e tags:
```
python
```
python
# CPRD Aurum medical dictionary: med code plus its Read2/SNOMED equivalents.
df = pd.read_csv('maps/CPRD_CodeBrowser_202211_Aurum/CPRDAurumMedical.txt', sep='\t')
df = df[["MedCodeId", "CleansedReadCode", "SnomedCTConceptId"]].rename(columns={
    "MedCodeId": "med_code",
    "CleansedReadCode": "read2_code",
    "SnomedCTConceptId": "snomed_code",
})
# Other CPRD browser files (not processed here):
# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_Aurum/CPRDAurumProduct.txt', sep='\t', dtype=str)
# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_GOLD/medical.txt', sep='\t')
# df = df.reset_index().iloc[:,[1,6]]
# df = df.rename(columns={"level_1":"read2_code", "20220523":"description"})
# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_GOLD/product.txt', sep='\t', dtype=str) #CANNOT OPEN
df
```
```
%% Cell type:code id:a968ffb1-4337-456b-8d20-419888b4044f tags:
%% Cell type:code id:a968ffb1-4337-456b-8d20-419888b4044f tags:
```
python
```
python
# BNF -> SNOMED mapping spreadsheet, exported as an all-string parquet map.
df = pd.read_excel("maps/BNF Snomed Mapping data 20231215.xlsx")
df = df.astype(str).rename(columns={
    "BNF Code": "bnf_code",
    "SNOMED Code": "snomed_code",
})
df[["bnf_code", "snomed_code"]].to_parquet("maps/processed/bnf_code_to_snomed_code.parquet", index=False)
```
```
%% Cell type:code id:c70b1ce2-0f41-4d02-ad17-6fc44bc3c6bf tags:
%% Cell type:code id:c70b1ce2-0f41-4d02-ad17-6fc44bc3c6bf tags:
```
python
```
python
#BNF to Readv2 Merge
#BNF to Readv2 Merge
df1 = pd.read_parquet("maps/processed/bnf_code_to_snomed_code.parquet").astype(str)
df1 = pd.read_parquet("maps/processed/bnf_code_to_snomed_code.parquet").astype(str)
df2 = pd.read_parquet("maps/processed/read2_code_to_snomed_code.parquet").astype(str)
df2 = pd.read_parquet("maps/processed/read2_code_to_snomed_code.parquet").astype(str)
# df1.merge(df2, how="inner", on="snomed_code")
# df1.merge(df2, how="inner", on="snomed_code")
```
```
%% Cell type:code id:d5d34237-02d4-4dea-8c20-5adaf337f6b5 tags:
%% Cell type:code id:d5d34237-02d4-4dea-8c20-5adaf337f6b5 tags:
```
python
```
python
# Inner-join BNF->SNOMED with Read2->SNOMED on the shared snomed_code column.
merged = df1.merge(df2, how='inner', on='snomed_code')
merged
```
```
%%
Output
%%
Cell type:code id:b3166cf0-e4a5-43e0-aeac-78827427422e tags:
Empty DataFrame
```
python
Columns: [bnf_code, snomed_code, read2_code]
.astype(str).dtypes
Index: []
```
%% Cell type:code id:
d0cbadfe-ef55-40a8-a0f1-a9fc69d7456b
tags:
%% Cell type:code id:
c0a766f9-7959-4a10-b58f-cd946a878b60
tags:
```
python
```
python
import json  # FIX: `json` was only imported in a later cell, so running this
             # cell in order raised NameError on the final line.

# Convert the PHEN summary spreadsheet into concept-set records, each carrying
# the remaining columns as a metadata dict, then serialize to JSON.
df = pd.read_csv("../concepts/PHEN_summary_working.csv")
cols = list(df.columns)
cols.remove('CONCEPT NAME ')
cols.remove('AGREED')
# Stringify numeric cells so they serialize cleanly (the old comment said
# "change to int" but the code has always converted to str).
df = df.applymap(lambda x: str(x) if isinstance(x, (int, float)) else x)
df_copy = df.rename(columns={
    "CONCEPT NAME ": "concept_set_name",
    "AGREED": "concept_set_status",
})
df_copy["concept_set_status"] = df_copy["concept_set_status"].replace("USE", "AGREED")
df_copy = df_copy[["concept_set_name", "concept_set_status"]]
outs = df_copy.to_dict(orient='records')
for i, out in enumerate(outs):
    # Attach every non-name/status column as per-record metadata.
    out["metadata"] = dict(df[cols].iloc[i])
json.dumps(outs)
```
```
%%
Output
%%
Cell type:code id:8a204a95-dc4c-4183-9ea7-f5c5e95e9087 tags:
Empty DataFrame
```
python
Columns: [bnf_code, snomed_code, read2_code]
```
Index: []
%% Cell type:code id:
b3166cf0-e4a5-43e0-aeac-78827427422e
tags:
%% Cell type:code id:
5ce1ab58-50b4-4c22-b72b-c698de6830f7
tags:
```
python
```
python
.astype(str).dtypes
import json
```
```
%% Output
%% Cell type:code id:f1ea81c6-d1db-408f-9d3a-b96f44efe21f tags:
```
python
```
%% Cell type:markdown id:5eb544a3-9dd1-41e8-88c2-a808646c6112 tags:
### OMOP Database
%% Cell type:code id:c9e58e62-9e44-4d0c-9d8d-35c175c07e6c tags:
```
python
import sqlite3
import csv
import pandas as pd
import os
```
%% Cell type:code id:4f67c9a1-373f-4799-8a85-72767662d912 tags:
```
python
```
%% Cell type:code id:d0ecdf69-ee90-42c1-ad25-d8357b603d1b tags:
```
python
#IMPORT OMOP VOCABS
# Load every tab-separated CSV in the vocabulary download into the OMOP
# SQLite database, one table per file (table name = file name sans extension).
conn = sqlite3.connect("codes/omop_54.sqlite")  # change to 'sqlite:///your_filename.db'
folder_path = "codes/vocabulary_download_v5_{9424944c-2b76-4127-8f05-f535e0f15e2a}_1731661390540"
try:
    # Check if the folder exists
    if not os.path.isdir(folder_path):
        raise Exception(f"Error: The folder '{folder_path}' does not exist.")
    # Iterate through files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):  # Check if the file is a CSV
            file_path = os.path.join(folder_path, filename)
            try:
                print(f"Reading file: {file_path}")
                # OMOP vocabulary CSVs are tab-delimited despite the extension.
                df = pd.read_csv(file_path, delimiter="\t", low_memory=False)
                table_name = os.path.splitext(os.path.basename(file_path))[0]
                # Export table to the sqlite db, replacing any previous load.
                df.to_sql(table_name, conn, if_exists='replace', index=False)
            except Exception as e:
                # FIX: chain the original exception so the traceback is kept.
                raise Exception(f"Error reading file {file_path}: {e}") from e
    conn.commit()
finally:
    # FIX: the connection was leaked whenever any file failed to load.
    conn.close()
```
%% Cell type:code id:b9cafd0c-a3bd-408b-bca8-b0de2acde1cd tags:
```
python
# Create a SQL connection to the OMOP SQLite database and check whether the
# custom 'MELDB' vocabulary row exists.  The commented statements below are
# kept from exploratory work (schema inspection and the one-off DDL that
# created the MELDB vocabulary and concept-set tables).
conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()

# Inspect a table's columns / list tables:
# cur.execute(f"PRAGMA table_info(CONCEPT_SET);"); print(pd.DataFrame(cur.fetchall()))
# cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=? ;", ("VOCABULARY",)); print(cur.fetchone())

cur.execute("SELECT vocabulary_id FROM VOCABULARY WHERE vocabulary_id=? ;", ("MELDB",))
print(cur.fetchone())

# Ad-hoc CONCEPT queries used during exploration:
# cur.execute('SELECT * FROM CONCEPT;')
# cur.execute('SELECT * FROM CONCEPT WHERE standard_concept = "C";')
# cur.execute('SELECT * FROM CONCEPT WHERE concept_code = "119768002" LIMIT 1;')
# cur.execute('SELECT * FROM CONCEPT WHERE concept_code IN ("119768002", "5905001");')
# cur.execute('SELECT DISTINCT VOCABULARY_ID FROM CONCEPT;')
# df = pd.DataFrame(cur.fetchall()); print(list(df[0])); display(df)

# One-off setup: register the MELDB vocabulary row.
# meldb_version = 'v3.2.10'
# meldb_description = 'Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity'
# meldb_reference = 'https://www.it-innovation.soton.ac.uk/projects/meldb'
# df_test = pd.DataFrame([{
#     "vocabulary_id": 'MELDB',
#     "vocabulary_name": meldb_description,
#     "vocabulary_reference": meldb_reference,
#     "vocabulary_version": meldb_version,
#     # "vocabulary_concept_id": 0,
# }])
# df_test.to_sql("VOCABULARY", conn, if_exists='append', index=False)

# One-off setup: concept-set tables (and their teardown).
# cur.execute("""
# CREATE TABLE CONCEPT_SET (
#     concept_set_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each concept set
#     atlas_id INTEGER,                                 -- Unique identifier generated by ATLAS
#     concept_set_name TEXT,                            -- Optional name for the concept set
#     concept_set_description TEXT,                     -- Optional description for the concept set
#     vocabulary_id TEXT NOT NULL,                      -- Foreign key to VOCABULARY table
#     FOREIGN KEY (vocabulary_id) REFERENCES VOCABULARY(vocabulary_id)
# );""")
# cur.execute("DROP TABLE CONCEPT_SET;")
# cur.execute("""
# CREATE TABLE CONCEPT_SET_ITEM (
#     concept_set_item_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each mapping
#     concept_set_id INTEGER NOT NULL,                       -- Foreign key to CONCEPT_SET table
#     concept_id INTEGER NOT NULL,                           -- Foreign key to CONCEPT table
#     FOREIGN KEY (concept_set_id) REFERENCES CONCEPT_SET(concept_set_id),
#     FOREIGN KEY (concept_id) REFERENCES CONCEPT(concept_id)
# );""")
# cur.execute("DROP TABLE CONCEPT_SET_ITEM;")

# Be sure to close the connection
conn.close()
```
%% Cell type:code id:d03b75f3-902f-42d7-b52f-dac7e79ecb11 tags:
```
python
# Load one SNOMED concept-set CSV into a scratch `test` table, keyed by code.
conn = sqlite3.connect("codes/omop_54.sqlite") # change to 'sqlite:///your_filename.db'
cur = conn.cursor()
file_path = "/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/HEART_VALVE_DISORDERS.csv"
# Read and index in one expression; the index column becomes the table's key column.
df = pd.read_csv(file_path, low_memory=False).set_index("code")
df.to_sql(name='test', con=conn, if_exists='replace')
conn.commit()
conn.close()
```
%% Cell type:code id:d96c3511-3831-400e-ba40-0a36abcc60d3 tags:
```
python
#DISPLAY SQL TABLE
table = "CONCEPT_SET_ITEM"

# Create a SQL connection to our SQLite database
conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()

# Fetch the column metadata; column 1 of the PRAGMA output is the column name.
cur.execute(f"PRAGMA table_info({table});")
df_cols = pd.DataFrame(cur.fetchall())
print(df_cols)
df_cols = df_cols[1]

# Fetch and display the whole table, relabelled with the real column names.
# (The SELECT literal had been split across lines by extraction; reconstructed.)
cur.execute(f"SELECT * FROM {table};")
df = pd.DataFrame(cur.fetchall())
df = df.rename(columns={i: s for i, s in enumerate(df_cols)})
display(df)

conn.close()
# Row-count sanity checks from previous runs:
# a+s = 13364
# a+s+i = 13591
```
%% Cell type:code id:42d49a00-9646-4ba4-afb6-12297289b7a7 tags:
```
python
def sql_row_exist(conn, table, column, value):
    """Return True if `table` has at least one row where `column` equals `value`."""
    # Identifiers (table/column names) cannot be bound as parameters, so they
    # are interpolated; the value itself is passed as a bound parameter.
    row = conn.cursor().execute(
        f"SELECT 1 FROM {table} WHERE {column} = ? LIMIT 1;",
        (value,),
    ).fetchone()
    return row is not None
```
%% Cell type:code id:f7b51bcd-6ee1-4023-8d36-7f419ce4120d tags:
```
python
#EXPORT MELDB CSV OUTPUT
# Load a phenotype output CSV and register each MELDB concept set (and its
# member concepts) in the OMOP SQLite database.
conn = sqlite3.connect("codes/omop_54.sqlite") # change to 'sqlite:///your_filename.db'
cur = conn.cursor()

vocab_output = "MELDB"   # vocabulary the concept sets are registered under
vocab_type = "SNOMED"    # vocabulary of the source codes in the CSV

file_path = "/home/jjd1c23/ssd/meldb/jjd1c23/phenotype/output/V3_2_10_MELD_snomed_no_translate.csv"
# file_path = "/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/HEART_VALVE_DISORDERS.csv"

# Read the CSV file with the specified delimiter
out = pd.read_csv(file_path, low_memory=False)
print(out.columns)  # BUGFIX: was `df.columns` — `df` is a stale variable from another cell

for concept_set_name, grp in out.groupby("MELDB_concept"):
    # Create the concept set if it does not exist yet.
    if not sql_row_exist(conn, "CONCEPT_SET", "concept_set_name", concept_set_name):
        # BUGFIX: parameterised — the f-string INSERT broke on names containing quotes.
        cur.execute(
            "INSERT INTO CONCEPT_SET (concept_set_name, vocabulary_id) VALUES (?, 'MELDB');",
            (concept_set_name,),
        )
    else:
        print("concept_set", concept_set_name, "already exists")
        #TODO: ask to remove old concept_set?

    # Get the concept_set_id just created/found.
    query = "SELECT concept_set_id FROM CONCEPT_SET WHERE concept_set_name = ? AND vocabulary_id = ?;"
    cur.execute(query, (concept_set_name, vocab_output, ))
    concept_set_id = cur.fetchone()[0]

    # Map each source code (e.g. SNOMED) to its OMOP concept_id, binding every
    # code as a parameter instead of splicing quoted strings into the SQL.
    codes = [str(c) for c in grp["code"]]
    placeholders = ", ".join("?" for _ in codes)
    query = f"SELECT concept_id FROM CONCEPT WHERE vocabulary_id = ? AND concept_code IN ({placeholders});"
    print(query)
    cur.execute(query, (vocab_type, *codes))
    df_out = pd.DataFrame(cur.fetchall(), columns=["concept_id"])

    if not len(grp) == len(df_out):
        print("ERROR: Some", vocab_type, "Codes do not exist in OMOP Database")

    # Link every matched concept to the concept set.
    df_out["concept_set_id"] = concept_set_id
    df_out.to_sql("CONCEPT_SET_ITEM", conn, if_exists='append', index=False)
    display(df_out)
    # break

conn.commit()
conn.close()
```
%% Cell type:code id:85007741-e34c-4112-a63c-9fb302b76958 tags:
```
python
# Builds a single-quoted, comma-separated SQL value list from grp["code"],
# e.g. "'119768002', '5905001'" (scratch cell; `grp` comes from the loop above).
"'"+"', '".join(list(grp["code"].astype(str)))+"'"
```
%% Cell type:markdown id:423e7c21-f3bd-439d-9dcb-c17cc2cc6854 tags:
### ATLAS
%% Cell type:code id:c6b45e4d-c7d2-42e7-9b4a-0e9c1c86d34b tags:
```
python
#Create ATLAS Concept Set
def atlas_create_concept(name, description="", items=None):
    """Create a concept set via the ATLAS WebAPI.

    Relies on module-level `url` and `headers` (defined elsewhere in the
    notebook).

    :param name: concept-set name shown in ATLAS.
    :param description: optional free-text description.
    :param items: list of concept expression items (default: empty list).
    :return: the new concept set's id on success, None on failure.
    """
    data = {
        "id": 0,
        "name": name,
        "description": description,
        "expression": {
            # BUGFIX: `items=[]` was a mutable default argument shared across calls.
            "items": items if items is not None else []
        }
    }
    try:
        # Sending the POST request
        response = requests.post(url, json=data, headers=headers)
        # Check the response status
        if response.status_code == 200 or response.status_code == 201:
            print("POST request successful:")
            body = response.json()  # Assuming the response is JSON
            print(body)
            # BUGFIX: was `response["id"]` — Response objects are not
            # subscriptable; the id lives in the decoded JSON body.
            return body["id"]
        else:
            print(f"POST request failed. HTTP Status Code: {response.status_code}")
            print("Response content:")
            print(response.text)
            return None
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

# Known ATLAS demo concept-set ids:
# Heart Test 1 - 1885487
# Heart Test 2 - 1885488
# Heart Valve Disorders - 1885449
```
%% Cell type:code id:45497623-1da0-4f74-b21e-da8811c89b04 tags:
```
python
def get_omop_concepts(cur, codes, vocab_id):
    """Fetch the OMOP CONCEPT rows matching `codes` within one vocabulary.

    :param cur: sqlite3 cursor on the OMOP database.
    :param codes: iterable of source concept codes (e.g. SNOMED codes).
    :param vocab_id: OMOP vocabulary_id to restrict the lookup to.
    :return: pandas DataFrame of matching CONCEPT rows (integer column labels;
        column 0 is concept_id, per the CONCEPT table's column order).
    """
    codes = [str(c) for c in codes]
    # Bind every code as a parameter instead of splicing quoted strings into
    # the SQL (the original built the IN (...) list by string concatenation).
    placeholders = ", ".join("?" for _ in codes)
    cur.execute(
        f"SELECT * FROM CONCEPT WHERE concept_code IN ({placeholders}) AND VOCABULARY_ID = ?;",
        (*codes, vocab_id),
    )
    df = pd.DataFrame(cur.fetchall())  # convert to pandas df
    # Guard the empty case: `df[0]` raises KeyError on an empty DataFrame.
    found = list(df[0]) if not df.empty else []
    print("Identified", len(found), "OMOP Concepts:", found)
    return df
def omop_concepts_to_atlas_json(df):
    """Convert OMOP CONCEPT rows into ATLAS concept-set expression items.

    :param df: DataFrame of CONCEPT rows with integer column labels, in the
        column order of ``SELECT * FROM CONCEPT`` (0=id, 1=name, 2=domain,
        3=vocabulary, 4=class, 6=code, 7=valid start, 8=valid end).
    :return: list of dicts matching the ATLAS API expression-item schema.
    """
    # Renamed from `json`: the original local shadowed the json module.
    items = []
    for _, row in df.iterrows():
        # Template for the ATLAS API.
        items.append({
            "concept": {
                'CONCEPT_ID': row[0],
                'CONCEPT_NAME': row[1],
                # NOTE(review): every concept is marked standard/valid regardless
                # of its actual flags in the row — confirm this is intended.
                'STANDARD_CONCEPT': 'S',
                'STANDARD_CONCEPT_CAPTION': 'Standard',
                'INVALID_REASON': 'V',
                'INVALID_REASON_CAPTION': 'Valid',
                'CONCEPT_CODE': row[6],
                'DOMAIN_ID': row[2],
                'VOCABULARY_ID': row[3],
                'CONCEPT_CLASS_ID': row[4],
                'VALID_START_DATE': int(row[7]),
                'VALID_END_DATE': int(row[8])
            },
            'isExcluded': False,
            'includeDescendants': False,
            'includeMapped': False
        })
    return items
# Open the OMOP SQLite database for the code-to-concept lookups below.
conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()
# Vocabulary to resolve the CSV's codes against.
vocab_id="SNOMED" #SNOMED, ATC, ICD10CM, ICD9CM, Read
# Phenotype concept CSV to convert; expected to have a "code" column.
csv_output = "/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/ANGER.csv"
#Load CSV Output File
df_in = pd.read_csv(csv_output)
print(len(df_in))
# df = get_omop_concepts(cur, ["119768002", "5905001"], "SNOMED")
df = get_omop_concepts(cur, list(df_in["code"]), vocab_id)
# NOTE(review): this assignment shadows the `json` module for the rest of
# the notebook session — consider renaming.
json = omop_concepts_to_atlas_json(df)
# display(json)
conn.close()
```
%% Cell type:code id:ea759907-c085-472a-82e2-07b6b19e2c8f tags:
```
python
#ATLAS GET CONCEPT SET
import requests
def request_get(url):
    """GET `url` and return the decoded JSON body, or None on any failure."""
    try:
        # Sending the GET request
        response = requests.get(url)
        # Anything other than 200 OK is reported and treated as a failure.
        if response.status_code != 200:
            print(f"Failed to fetch data. HTTP Status Code: {response.status_code}")
            print("Response content:")
            print(response.text)
            return None
        print("Response data:")
        # print(response.json())  # Assuming the response is in JSON format
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
#GET SET INFO
set_id = "1885449"
url = f"https://atlas-demo.ohdsi.org/WebAPI/conceptset/{set_id}"
request_get(url)
```
%% Cell type:code id:5a70e636-6051-4930-bf1b-30d093fd0552 tags:
```
python
#GET SET ITEMS (Concepts)
# Fetch the resolved concept expression for one ATLAS concept set
# via the public demo WebAPI and show the JSON payload.
set_id = "1885449"  # "Heart Valve Disorders" (see ids noted in the cell above)
url = f"https://atlas-demo.ohdsi.org/WebAPI/conceptset/{set_id}/expression/ATLASPROD"
response = request_get(url)
display(response)
```
%% Cell type:code id:96bfcd9c-27e8-4be4-a680-7553d908790e tags:
```
python
#ATLAS CREATE CONCEPT SET
```
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment