Commit 50a29145 authored by mjbonifa

removed conda, notebook and scripts as they are no longer needed

parent 18c4229c
name: acmc
channels:
- conda-forge
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- asttokens=3.0.0=pyhd8ed1ab_1
- blosc=1.21.6=he440d0b_1
- brotli-python=1.1.0=py313h46c70d0_2
- bzip2=1.0.8=h4bc722e_7
- c-ares=1.34.4=hb9d3cd8_0
- c-blosc2=2.16.0=h3122c55_0
- ca-certificates=2025.1.31=hbcca054_0
- certifi=2025.1.31=pyhd8ed1ab_0
- cffi=1.17.1=py313hfab6e84_0
- charset-normalizer=3.4.1=pyhd8ed1ab_0
- comm=0.2.2=pyhd8ed1ab_1
- debugpy=1.8.12=py313h46c70d0_0
- decorator=5.1.1=pyhd8ed1ab_1
- exceptiongroup=1.2.2=pyhd8ed1ab_1
- executing=2.1.0=pyhd8ed1ab_1
- h2=4.2.0=pyhd8ed1ab_0
- hdf5=1.14.3=nompi_h2d575fe_109
- hpack=4.1.0=pyhd8ed1ab_0
- hyperframe=6.1.0=pyhd8ed1ab_0
- idna=3.10=pyhd8ed1ab_1
- importlib-metadata=8.6.1=pyha770c72_0
- ipykernel=6.29.5=pyh3099207_0
- ipython=8.32.0=pyh907856f_0
- jedi=0.19.2=pyhd8ed1ab_1
- jupyter_client=8.6.3=pyhd8ed1ab_1
- jupyter_core=5.7.2=pyh31011fe_1
- keyutils=1.6.1=h166bdaf_0
- krb5=1.21.3=h659f571_0
- ld_impl_linux-64=2.43=h712a8e2_2
- libaec=1.1.3=h59595ed_0
- libblas=3.9.0=28_h59b9bed_openblas
- libcblas=3.9.0=28_he106b2a_openblas
- libcurl=8.11.1=h332b0f4_0
- libedit=3.1.20250104=pl5321h7949ede_0
- libev=4.33=hd590300_2
- libexpat=2.6.4=h5888daf_0
- libffi=3.4.6=h2dba641_0
- libgcc=14.2.0=h77fa898_1
- libgcc-ng=14.2.0=h69a702a_1
- libgfortran=14.2.0=h69a702a_1
- libgfortran5=14.2.0=hd5240d6_1
- libgomp=14.2.0=h77fa898_1
- liblapack=3.9.0=28_h7ac8fdf_openblas
- liblzma=5.6.4=hb9d3cd8_0
- libmpdec=4.0.0=h4bc722e_0
- libnghttp2=1.64.0=h161d5f1_0
- libopenblas=0.3.28=pthreads_h94d23a6_1
- libsodium=1.0.20=h4ab18f5_0
- libsqlite=3.48.0=hee588c1_1
- libssh2=1.11.1=hf672d98_0
- libstdcxx=14.2.0=hc0a3c3a_1
- libstdcxx-ng=14.2.0=h4852527_1
- libuuid=2.38.1=h0b41bf4_0
- libzlib=1.3.1=hb9d3cd8_2
- lz4-c=1.10.0=h5888daf_1
- matplotlib-inline=0.1.7=pyhd8ed1ab_1
- ncurses=6.5=h2d0b736_3
- nest-asyncio=1.6.0=pyhd8ed1ab_1
- nomkl=1.0=h5ca1d4c_0
- numexpr=2.10.2=py313h5f97788_100
- numpy=2.2.3=py313h17eae1a_0
- openssl=3.4.1=h7b32b05_0
- packaging=24.2=pyhd8ed1ab_2
- pandas=2.2.3=py313ha87cce1_1
- parso=0.8.4=pyhd8ed1ab_1
- pexpect=4.9.0=pyhd8ed1ab_1
- pickleshare=0.7.5=pyhd8ed1ab_1004
- pip=25.0.1=pyh145f28c_0
- platformdirs=4.3.6=pyhd8ed1ab_1
- prompt-toolkit=3.0.50=pyha770c72_0
- psutil=6.1.1=py313h536fd9c_0
- ptyprocess=0.7.0=pyhd8ed1ab_1
- pure_eval=0.2.3=pyhd8ed1ab_1
- py-cpuinfo=9.0.0=pyhd8ed1ab_1
- pycparser=2.22=pyh29332c3_1
- pygments=2.19.1=pyhd8ed1ab_0
- pysocks=1.7.1=pyha55dd90_7
- pytables=3.10.2=py313hd261420_1
- python=3.13.1=ha99a958_105_cp313
- python-dateutil=2.9.0.post0=pyhff2d567_1
- python-tzdata=2025.1=pyhd8ed1ab_0
- python_abi=3.13=5_cp313
- pytz=2024.1=pyhd8ed1ab_0
- pyzmq=26.2.1=py313h8e95178_0
- readline=8.2=h8228510_1
- requests=2.32.3=pyhd8ed1ab_1
- six=1.17.0=pyhd8ed1ab_0
- snappy=1.2.1=h8bd8927_1
- stack_data=0.6.3=pyhd8ed1ab_1
- tk=8.6.13=noxft_h4845f30_101
- tornado=6.4.2=py313h536fd9c_0
- traitlets=5.14.3=pyhd8ed1ab_1
- typing-extensions=4.12.2=hd8ed1ab_1
- typing_extensions=4.12.2=pyha770c72_1
- tzdata=2025a=h78e105d_0
- urllib3=2.3.0=pyhd8ed1ab_0
- wcwidth=0.2.13=pyhd8ed1ab_1
- zeromq=4.3.5=h3b0a872_7
- zipp=3.21.0=pyhd8ed1ab_1
- zlib-ng=2.2.4=h7955e40_0
- zstandard=0.23.0=py313h80202fe_1
- zstd=1.5.6=ha6fb4c9_0
- pip:
- aiosqlite==0.21.0
- click==8.1.8
- cramjam==2.9.1
- et-xmlfile==2.0.0
- fastparquet==2024.11.0
- fsspec==2025.2.0
- gitdb==4.0.12
- gitpython==3.1.44
- greenlet==3.1.1
- iniconfig==2.0.0
- lxml==5.3.1
- openpyxl==3.1.5
- pluggy==1.5.0
- pyarrow==19.0.0
- pyomop==4.3.0
- pytest==8.3.4
- simpledbf==0.2.6
- smmap==5.0.2
- sqlalchemy==2.0.38
prefix: /opt/conda/envs/acmc
%% Cell type:code id:8c8f4cdf-04a5-4762-895e-6555781a1f05 tags:
``` python
import pandas as pd
import numpy as np
import json
import os  # needed by the file-existence checks below
```
%% Cell type:markdown id:c5786d78-7dc2-4f02-ad21-cee95e473823 tags:
### Generate JSON from the Ho code lists
%% Cell type:code id:0292dc90-e31a-4724-8536-d0b55533aaef tags:
``` python
# List v4 to json
df = pd.read_excel("PHEN_code_lists_sources_V4.xlsx", sheet_name="ho", dtype=str)
# df = df.sort_values(by="mapped_condition")

def json_file_template(file, cons, types, metadata):
    # build the comma-separated concept list
    concepts = ""
    for concept in cons:
        concepts += f'"{concept}", '
    concepts = concepts[:-2]  # remove trailing ", "
    # build the column-type mapping
    type_str = ""
    for k, v in types.items():
        type_str += f'"{k}":"{v}", '
    type_str = type_str[:-2]
    # build the metadata column list
    meta_str = '"metadata":['
    for v in metadata:
        meta_str += f'"{v}", '
    meta_str = meta_str[:-2]
    meta_str = meta_str + "]"
    return '''
    {
        "file":"''' + file + '''",
        "columns":{
            ''' + type_str + ''',
            ''' + meta_str + '''
        },
        "meldb_phenotypes":[''' + concepts + ''']
    },'''

out = '"files":['
folder = "codes/GitHub_TG_repository/"
for file, grp in df.groupby("mapped_condition"):
    file = file.replace("%20", " ")
    for ext in ["_CPRD_GOLD.csv", "_CPRD_AURUM.csv", "_IMRD.csv"]:
        path = file + "/" + file + ext
        if os.path.isfile(folder + path):
            out += json_file_template(path, grp["meldb_condition"],
                                      types={
                                          "read2_code": "READ_CODE",
                                          "snomed_code": "SNOMED_CT_CODE",
                                          # "med_code":"MEDICAL_CODE_ID",
                                      },
                                      metadata=["DESCRIPTION"])
        else:
            print("NOT FILE", folder + path)
    for ext in ["_ICD10.csv"]:
        path = file + "/" + file + ext
        if os.path.isfile(folder + path):
            out += json_file_template(path, grp["meldb_condition"],
                                      types={
                                          "icd10_code": "READ_CODE",  # NOTE: type name kept as in the original, though an ICD10 type may be intended
                                          "snomed_code": "SNOMED_CT_CODE",
                                          # "icd10_code":"MEDICAL_CODE_ID",
                                      },
                                      metadata=["DESCRIPTION"])
        else:
            print("NOT FILE", folder + path)
    # out += json_file_template(file+"/"+file+"_CPRD_AURUM.csv", grp["meldb_condition"])
    # out += json_file_template(file+"/"+file+"_ICD10.csv", grp["meldb_condition"])
    # out += json_file_template(file+"/"+file+"_IMRD.csv", grp["meldb_condition"])
    # out += f' "{file}/{file}_CPRD_GOLD.csv":[{conds}],\n'
    # out += f' "{file}/{file}_CPRD_AURUM.csv":[{conds}],\n'
    # out += f' "{file}/{file}_ICD10.csv":[{conds}],\n'
    # out += f' "{file}/{file}_IMRD.csv":[{conds}],\n'

out = out[:-1]  # remove last ,
out += "\n]"
out = out.replace("%20", " ")
print(out)
```
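%% Cell type:markdown tags:
The template above emits a JSON fragment (`"files":[...]`) rather than a complete document. A minimal sanity check, assuming the concept names and paths contain no characters that need JSON escaping, is to wrap the fragment in braces and parse it back:
%% Cell type:code tags:
``` python
# sketch: validate the generated fragment by parsing it as JSON
import json
parsed = json.loads("{" + out + "}")
print(len(parsed["files"]), "file entries generated")
```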
%% Cell type:code id:f155b635-b459-4aff-81b2-e065fc223858 tags:
``` python
```
%% Cell type:code id:d040eda5-4028-4047-834c-7315e307e415 tags:
``` python
df = pd.read_parquet("maps/processed/icd10_code.parquet")
df
```
%% Cell type:code id:e0228ac9-8852-4818-b7f0-98429ca5229c tags:
``` python
code = ["A00.0", "*00.0"]
code = pd.Series(code)
print(code.isin(df["icd10_code"]))
print(code.isin(df["icd10_alt_code"]))
# print( )
~(
~code.isin(df["icd10_code"])
&
~code.isin(df["icd10_alt_code"])
)
```
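%% Cell type:markdown tags:
By De Morgan's law, the negated conjunction above is just membership in either column; an equivalent, more direct form of the check would be:
%% Cell type:code tags:
``` python
# equivalent to ~(~a & ~b): the code appears in either column
code.isin(df["icd10_code"]) | code.isin(df["icd10_alt_code"])
```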
%% Cell type:markdown id:18efcacd-45f0-4341-86cc-d8e2e584350c tags:
### Analyse the JSON file
%% Cell type:code id:85dc197b-451e-4fa9-a53b-e6770c132123 tags:
``` python
import json
import os
path_json = "../concepts/PHEN_assign_v3.json"
#Load JSON Concept Definitions
mapping = json.load(open(path_json,'rb'))
summary_config = mapping["concept_sets"]["concept_set"]
summary_df = pd.DataFrame(summary_config) #change to dataframe
summary_df = summary_df.join(pd.json_normalize(summary_df["metadata"])) #metadata to columns
summary_df = summary_df.drop(columns=["metadata"])
summary_df = summary_df.rename(columns={"concept_set_name":"CONCEPT_SET"})
summary_df = summary_df.drop_duplicates() #remove duplicates
summary_df
```
%% Cell type:code id:4c9b6b3f-08aa-4f61-b9b2-44a24b5d00a0 tags:
``` python
import json
import os

path_json = "PHEN_assign_v3.json"
path_excel = "PHEN_summary_working.xlsx"
path_codes = "codes/"

# Get all files referenced in the JSON
def get_json_files(path_json):
    folders = json.load(open(path_json, 'rb'))
    out = []
    for folder in folders:
        if "files" in folder:
            for file in folder["files"]:
                file_path = folder["folder"] + "/" + file["file"]
                if "meldb_phenotypes" in file:
                    for concept in file["meldb_phenotypes"]:
                        out.append({"json_concept": concept, "filepath": file_path, "json_code_types": list(file["columns"].keys())})
                elif "meldb_phenotypes_categories" in file:
                    for code, concept in file["meldb_phenotypes_categories"].items():
                        out.append({"json_concept": concept[0], "filepath": file_path, "json_code_types": list(file["columns"].keys())})
                else:
                    out.append({"json_concept": None, "filepath": file_path})
    out = pd.DataFrame(out)
    out["filepath"] = out["filepath"].astype(str)
    return out

out = get_json_files(path_json)

# Get all files referenced in the Excel summary
def get_excel_files(path_excel):
    out2 = pd.read_excel(path_excel)
    out2 = out2[["CONCEPT NAME ", "CODING LIST", "AGREED", "FUNCTION"]].loc[1:]  # select relevant columns
    # Filter to concepts in use
    out2 = out2[out2["AGREED"] == "USE"]  # remove deprecated concepts
    out2 = out2[out2["FUNCTION"] == "QUERY BY CODING LIST"]  # keep only coding-list queries
    out2 = out2.drop(['AGREED', 'FUNCTION'], axis=1)
    # Get filepaths
    out2["CODING LIST"] = out2["CODING LIST"].str.split(",")  # split by ,
    out2 = out2.explode("CODING LIST")  # one row per file
    out2["CODING LIST"] = out2["CODING LIST"].str.strip()
    out2["CODING LIST"] = out2["CODING LIST"].str.replace("https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/", "")
    out2["CODING LIST"] = out2["CODING LIST"].str.replace("%20", " ")
    out2 = out2.rename(columns={"CONCEPT NAME ": "excel_concept", "CODING LIST": "filepath"})
    return out2

out2 = get_excel_files(path_excel)

# Get all files under /codes
def get_code_files(path_codes):
    all_files = []
    for root, dirs, files in os.walk(path_codes, topdown=False):
        for name in files:
            if ".ipynb_checkpoint" not in root:  # exclude notebook checkpoints
                if name.endswith(".csv") or name.endswith(".xlsx") or name.endswith(".dta"):  # exclude non-data files
                    all_files.append(os.path.join(root, name))
    all_files = pd.DataFrame(all_files)
    all_files = all_files.rename(columns={0: "filepath"})
    all_files["filepath"] = all_files["filepath"].astype(str)
    return all_files

all_files = get_code_files(path_codes)

print("ALL FILES", len(all_files), len(all_files["filepath"].unique()))
print("JSON CONCEPTS", len(out), len(out["filepath"].unique()))
print("EXCEL CONCEPTS", len(out2), len(out2["filepath"].unique()))

outs = pd.merge(all_files, out, how="outer", on="filepath")
outs = pd.merge(outs, out2, how="outer", on="filepath")
print(len(outs), len(outs["filepath"].unique()))
outs.to_csv("output/MELD_file_to_concept.csv", index=False)
# display(outs[outs["concept"].isna()])
# display(out)
```
%% Cell type:code id:f8e70c33-c869-46f8-953e-f6b52992cfbb tags:
``` python
display("JSON MISSING", outs[outs["json_concept"].isna() & outs["excel_concept"].notna()])
display("EXCEL MISSING", outs[outs["json_concept"].notna() & outs["excel_concept"].isna()])
```
%% Cell type:code id:9d84465f-f064-4df2-b0e4-2dfb217aea21 tags:
``` python
with open('concepts-output/MELD-report.md', 'a') as f:
    f.write(
"""
# Report
- One thing
- Two thing
- Three thing
""")
```
%% Cell type:code id:7f7fc771-e406-42c7-8a09-16a20b5298f5 tags:
``` python
# total rows across all code files
total_length = 0
for file in all_files["filepath"]:
    if file.endswith(".csv"):
        df_file = pd.read_csv(file)
        total_length += len(df_file)
    elif file.endswith(".xlsx"):
        df_file = pd.read_excel(file)
        total_length += len(df_file)
    elif file.endswith(".dta"):
        df_file = pd.read_stata(file)
        total_length += len(df_file)
total_length
```
%% Cell type:code id:08a9c565-28d6-46ee-9fa8-6fa0ee28a4d5 tags:
``` python
# Turn filepaths into GitLab links
outs2 = outs.copy()
outs2["filepath"] = "https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/" + outs2["filepath"].str.replace(" ", "%20")
# Group by concept and concatenate filepaths
# NOTE: `outs` carries json_concept/excel_concept columns; a "concept" column is assumed to exist here
outs2 = outs2.groupby("concept")["filepath"].apply(', '.join).reset_index()
outs2 = outs2.sort_values(by=["concept"])
outs2.to_csv("output/MELD_GitLab_link_to_concept.csv", index=False)
```
%% Cell type:markdown id:357bb84c-90c2-4b5f-95c0-443191783a7f tags:
### Analyse Output Files
%% Cell type:code id:7d3f9cb7-be86-4f1f-92f6-991094eb7bb7 tags:
``` python
version = "V2_2_2"
output_files = [f"output/{version}_MELD_concepts_readv2.csv",
f"output/{version}_MELD_snomed_no_translate.csv",
f"output/{version}_MELD_icd10_no_translate.csv",
# f"output/{version}_MELD_med_no_translate.csv",
f"output/{version}_MELD_atc_no_translate.csv"
]
error_file = f"output/{version}_MELD_errors.csv"
for output_file in output_files:
print("---"*3,output_file,"---"*3,)
df = pd.read_csv(output_file)
# df["MELDB_concept"].loc[df["CONCEPT TYPE"].isna()]
print("MELDB missing concepts ", len(df[df["CONCEPT TYPE"].isna()]))
if df["code"].dtype == "object":
print("Chars present:", np.sort(df["code"].apply(lambda x : set(x)).explode().unique()))
# len(df["MELDB_concept"].unique())
print("---"*3,error_file,"---"*3,)
df = pd.read_csv(error_file)
df = df.drop_duplicates()
df["CODE_TYPE"].value_counts()
# for i, row in df.drop_duplicates().iterrows():
# print(row["CODE"], row["CODE_TYPE"])
```
%% Cell type:code id:08e0ecc1-9271-48c3-9c5b-094800072906 tags:
``` python
def get_output_files(version):
    output_files = [f"output/{version}_MELD_concepts_readv2.csv",
                    f"output/{version}_MELD_snomed_no_translate.csv",
                    f"output/{version}_MELD_icd10_no_translate.csv",
                    # f"output/{version}_MELD_med_no_translate.csv",
                    f"output/{version}_MELD_atc_no_translate.csv"
                    ]
    error_file = f"output/{version}_MELD_errors.csv"
    return output_files, error_file

# version_1 = "V1_0_0"
version_1 = "V2_1_4"
version_2 = "V2_2_3"
output1, err1 = get_output_files(version_1)
output2, err2 = get_output_files(version_2)
print("## Compare Concepts", version_1, "to", version_2)
for out1, out2 in zip(output1, output2):
    print(out1, out2)
    df1 = pd.read_csv(out1)
    df1 = df1[["code", "MELDB_concept"]].groupby("MELDB_concept").count()
    df2 = pd.read_csv(out2)
    df2 = df2[["code", "MELDB_concept"]].groupby("MELDB_concept").count()
    # Added/removed concepts
    print("- Removed Concepts", list(set(df1.index) - set(df2.index)))
    print("- Added Concepts", list(set(df2.index) - set(df1.index)))
    # Changed concepts
    diff = df2 - df1  # diff in counts
    diff = diff[(~(diff["code"] == 0.0)) & diff["code"].notna()]  # keep non-zero counts
    s = "\n"
    for concept, row in diff.iterrows():
        s += "\t - {} {}\n".format(concept, row["code"])
    print("- Changed Concepts", s)
# for output_file in output_files:
# print("---"*3,output_file,"---"*3,)
# df = pd.read_csv(output_file)
# # df["MELDB_concept"].loc[df["CONCEPT TYPE"].isna()]
# print("MELDB missing concepts ", len(df[df["CONCEPT TYPE"].isna()]))
# if df["code"].dtype == "object":
# print("Chars present:", np.sort(df["code"].apply(lambda x : set(x)).explode().unique()))
```
%% Cell type:code id:cc60c137-5a85-4155-af6b-6796f8c05980 tags:
``` python
import glob
import os
import pandas as pd
df = pd.read_csv("/home/jjd1c23/ssd/meldb/jjd1c23/concepts/PHEN_summary_working.csv")
df = df.set_index("#")
for vocab in ["atc", "icd10", "readv2", "snomed"]:
df[vocab.upper()] = ""
for file in glob.glob(f"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/{vocab}/*.csv"):
concept_set = os.path.basename(file)[:-4]
row_index = df[df["CONCEPT NAME "] == concept_set].index[0]
df.loc[row_index, vocab.upper()] = "YES"
df = df.drop(columns=["READv2_CODE", "ICD10_CODE"])
df.to_csv("/home/jjd1c23/ssd/meldb/jjd1c23/concepts/PHEN_summary_working_labelled.csv")
```
%% Cell type:markdown id:e5c4291f-847b-4c82-976e-bd5b3a7b6bcc tags:
### Mappings
%% Cell type:code id:08e34750-413c-469e-bcb8-e71bb188ff42 tags:
``` python
#NHS Read Browser
import simpledbf
import pandas as pd
#r2 only
df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2/ANCESTOR.DBF').to_dataframe()
df = pd.concat([df['READCODE'], df['DESCENDANT']])
df = pd.DataFrame(df.drop_duplicates())
df = df.rename(columns={0:"read2_code"})
df.to_parquet("maps/processed/read2_code.parquet", index=False)
#r2 -> atc
df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2/ATC.DBF').to_dataframe()
df = df[["READCODE", "ATC"]]
df = df.rename(columns={"READCODE":"read2_code", "ATC":"atc_code"})
df.to_parquet("maps/processed/read2_code_to_atc_code.parquet", index=False)
#r2 -> icd10
df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2/ICD10.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"icd10_code"})
df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
df.to_parquet("maps/processed/read2_code_to_icd10_code.parquet", index=False)
#r2 -> opcs4
df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2/OPCS4V3.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"opcs4_code"})
df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
df.to_parquet("maps/processed/read2_code_to_opcs4_code.parquet", index=False)
#r3 only
df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V3/ANCESTOR.DBF').to_dataframe()
df = pd.concat([df['READCODE'], df['DESCENDANT']])
df = pd.DataFrame(df.drop_duplicates())
df = df.rename(columns={0:"read3_code"})
df.to_parquet("maps/processed/read3_code.parquet", index=False)
#r3 -> icd10
df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V3/ICD10.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"icd10_code"})
df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
df.to_parquet("maps/processed/read3_code_to_icd10_code.parquet", index=False)
#r3 -> icd9
# dbf = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V3/ICD9V3.DBF')
#r3 -> opcs4
df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V3/OPCS4V3.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"opcs4_code"})
df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
df.to_parquet("maps/processed/read3_code_to_opcs4_code.parquet", index=False)
```
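%% Cell type:markdown tags:
The DBF-to-parquet steps above repeat the same select/rename/filter pattern. A helper along these lines could reduce the duplication (a sketch: `cols` maps source to target column names, and the `-` filter is only applied where the originals used it):
%% Cell type:code tags:
``` python
# sketch: factor the repeated DBF -> parquet pattern into one helper
def dbf_to_parquet_map(dbf_path, cols, out_path, drop_dash=False):
    df = simpledbf.Dbf5(dbf_path).to_dataframe()
    df = df[list(cols.keys())].rename(columns=cols)
    if drop_dash:  # some tables carry range codes containing '-'
        for c in cols.values():
            df = df[~df[c].str.match("^.*-.*$")]
    df.to_parquet(out_path, index=False)

# e.g. the r2 -> icd10 step above becomes:
# dbf_to_parquet_map('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2/ICD10.DBF',
#                    {"READ_CODE": "read2_code", "TARG_CODE": "icd10_code"},
#                    "maps/processed/read2_code_to_icd10_code.parquet", drop_dash=True)
```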
%% Cell type:code id:5fe95638-1f25-45f3-803c-2fff74a2a4fd tags:
``` python
#NHS Data Migrations
#r2 only
# df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/rctcremap_uk_20200401000001.txt', sep='\t')
#r3 only
# df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/ctv3cremap_uk_20200401000001.txt', sep='\t')
#snomed only
df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/sctcremap_uk_20200401000001.txt', sep='\t')
df = df[["SCT_CONCEPTID"]]
df = df.rename(columns={"SCT_CONCEPTID":"snomed_code"})
df = df.drop_duplicates()
df = df.astype(str)
df.to_parquet("maps/processed/snomed_code.parquet", index=False)
#r2 -> r3
df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/rctctv3map_uk_20200401000001.txt', sep='\t')
df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]]
df = df.rename(columns={"V2_CONCEPTID":"read2_code",
"CTV3_CONCEPTID":"read3_code"})
df.to_parquet("maps/processed/read2_code_to_read3_code.parquet", index=False)
#r3->r2
df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/ctv3rctmap_uk_20200401000002.txt', sep='\t')
df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]]
df = df.rename(columns={"CTV3_CONCEPTID":"read3_code",
"V2_CONCEPTID":"read2_code"})
df = df.drop_duplicates()
df = df[~df["read2_code"].str.match("^.*_.*$")] #remove r2 codes with '_'
df.to_parquet("maps/processed/read3_code_to_read2_code.parquet", index=False)
#r2 -> snomed
df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/rcsctmap2_uk_20200401000001.txt', sep='\t', dtype=str)
df = df[["ReadCode", "ConceptId"]]
df = df.rename(columns={"ReadCode":"read2_code",
"ConceptId":"snomed_code"})
df.to_parquet("maps/processed/read2_code_to_snomed_code.parquet", index=False)
#r3->snomed
df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/ctv3sctmap2_uk_20200401000001.txt', sep='\t')
df = df[["CTV3_TERMID", "SCT_CONCEPTID"]]
df = df.rename(columns={"CTV3_TERMID":"read3_code",
"SCT_CONCEPTID":"snomed_code"})
df["snomed_code"] = df["snomed_code"].astype(str)
df = df[~df["snomed_code"].str.match("^.*_.*$")] #remove snomed codes with '_'
df.to_parquet("maps/processed/read3_code_to_snomed_code.parquet", index=False)
```
%% Cell type:code id:267fa1cc-5159-48c4-9eee-19af5039d627 tags:
``` python
#OPCS410 Data Files
df = pd.read_csv("maps/OPCS410 Data files txt/OPCS410 CodesAndTitles Nov 2022 V1.0.txt", sep='\t', dtype=str, header=None)
df = df.rename(columns={0:"opcs4_code", 1:"description"})
df.to_parquet("maps/processed/opcs4_code.parquet", index=False)
```
%% Cell type:code id:01d046fd-69af-44f3-acad-5d0edef3f745 tags:
``` python
#ICD10_edition5
df = pd.read_xml("maps/ICD10_Edition5_XML_20160401/Content/ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml",)
df = df[["CODE", "ALT_CODE", "DESCRIPTION"]]
df = df.rename(columns={"CODE":"icd10_code",
"ALT_CODE":"icd10_alt_code",
"DESCRIPTION":"description"
})
df.to_parquet("maps/processed/icd10_code.parquet", index=False)
```
%% Cell type:code id:36630e24-f56c-48e1-8ecf-4ccd2b41eaea tags:
``` python
code1="read2_code"
code2="icd10_code"
df_map = pd.read_parquet(f"maps/processed/{code1}_to_{code2}.parquet")
codes=df_map["read2_code"].iloc[:5]
pd.merge(codes, df_map, how='left')[code2]
```
%% Cell type:code id:9787adeb-8507-488b-9a91-b8df3fbbe21e tags:
``` python
#CPRD Code Browser
df = pd.read_csv('maps/CPRD_CodeBrowser_202211_Aurum/CPRDAurumMedical.txt', sep='\t')
df = df[["MedCodeId", "CleansedReadCode", "SnomedCTConceptId"]]
df = df.rename(columns={"MedCodeId":"med_code",
"CleansedReadCode":"read2_code",
"SnomedCTConceptId":"snomed_code"})
# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_Aurum/CPRDAurumProduct.txt', sep='\t', dtype=str)
# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_GOLD/medical.txt', sep='\t')
# df = df.reset_index().iloc[:,[1,6]]
# df = df.rename(columns={"level_1":"read2_code", "20220523":"description"})
# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_GOLD/product.txt', sep='\t', dtype=str) #CANNOT OPEN
df
```
%% Cell type:code id:a968ffb1-4337-456b-8d20-419888b4044f tags:
``` python
#BNF
df = pd.read_excel("maps/BNF Snomed Mapping data 20231215.xlsx")
df = df.astype(str)
df = df.rename(columns={"BNF Code":"bnf_code",
"SNOMED Code":"snomed_code"})
df[["bnf_code", "snomed_code"]].to_parquet("maps/processed/bnf_code_to_snomed_code.parquet", index=False)
```
%% Cell type:code id:c70b1ce2-0f41-4d02-ad17-6fc44bc3c6bf tags:
``` python
#BNF to Readv2 Merge
df1 = pd.read_parquet("maps/processed/bnf_code_to_snomed_code.parquet").astype(str)
df2 = pd.read_parquet("maps/processed/read2_code_to_snomed_code.parquet").astype(str)
# df1.merge(df2, how="inner", on="snomed_code")
```
%% Cell type:code id:d5d34237-02d4-4dea-8c20-5adaf337f6b5 tags:
``` python
df1.merge(df2, how='inner', on='snomed_code')
```
%% Cell type:code id:b3166cf0-e4a5-43e0-aeac-78827427422e tags:
``` python
# scratch fragment (left incomplete): the dataframe expression it chained from is missing
# .astype(str).dtypes
```
%% Cell type:code id:c0a766f9-7959-4a10-b58f-cd946a878b60 tags:
``` python
df = pd.read_csv("../concepts/PHEN_summary_working.csv")
cols = list(df.columns)
cols.remove('CONCEPT NAME ')
cols.remove('AGREED')
df = df.applymap(lambda x: str(x) if isinstance(x, (int, float)) else x) #change to int
df_copy = df.rename(columns={
"CONCEPT NAME ":"concept_set_name",
"AGREED":"concept_set_status"
})
df_copy["concept_set_status"] = df_copy["concept_set_status"].replace("USE", "AGREED")
df_copy = df_copy[["concept_set_name", "concept_set_status"]]
outs = df_copy.to_dict(orient='records')
for i, out in enumerate(outs):
out["metadata"] = dict(df[cols].iloc[i])
json.dumps(outs)
```
%% Cell type:code id:8a204a95-dc4c-4183-9ea7-f5c5e95e9087 tags:
``` python
```
%% Cell type:code id:5ce1ab58-50b4-4c22-b72b-c698de6830f7 tags:
``` python
import json
```
%% Cell type:code id:f1ea81c6-d1db-408f-9d3a-b96f44efe21f tags:
``` python
```
%% Cell type:markdown id:5eb544a3-9dd1-41e8-88c2-a808646c6112 tags:
### OMOP Database
%% Cell type:code id:c9e58e62-9e44-4d0c-9d8d-35c175c07e6c tags:
``` python
import sqlite3
import csv
import pandas as pd
import os
```
%% Cell type:code id:4f67c9a1-373f-4799-8a85-72767662d912 tags:
``` python
```
%% Cell type:code id:d0ecdf69-ee90-42c1-ad25-d8357b603d1b tags:
``` python
# IMPORT OMOP VOCABS
conn = sqlite3.connect("codes/omop_54.sqlite")
folder_path = "codes/vocabulary_download_v5_{9424944c-2b76-4127-8f05-f535e0f15e2a}_1731661390540"

# Check that the folder exists
if not os.path.isdir(folder_path):
    raise Exception(f"Error: The folder '{folder_path}' does not exist.")

# Iterate through files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):  # check if the file is a CSV
        file_path = os.path.join(folder_path, filename)
        try:
            print(f"Reading file: {file_path}")
            # OMOP vocabulary CSVs are tab-delimited
            df = pd.read_csv(file_path, delimiter="\t", low_memory=False)
            table_name = os.path.splitext(os.path.basename(file_path))[0]  # table named after the file
            # Export table to the sqlite db
            df.to_sql(table_name, conn, if_exists='replace', index=False)
        except Exception as e:
            raise Exception(f"Error reading file {file_path}: {e}")

conn.commit()
conn.close()
```
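%% Cell type:markdown tags:
A quick way to confirm the import worked is to list the tables now present in the database (a sketch):
%% Cell type:code tags:
``` python
# sketch: list the tables created by the vocabulary import
conn = sqlite3.connect("codes/omop_54.sqlite")
print(pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn))
conn.close()
```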
%% Cell type:code id:b9cafd0c-a3bd-408b-bca8-b0de2acde1cd tags:
``` python
# Create a SQL connection to our SQLite database
conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()
#Print ALL Columns in Table
# table="CONCEPT_SET"
# cur.execute(f"PRAGMA table_info({table});")
# print(pd.DataFrame(cur.fetchall()))
#Print ALL TABLE NAMES
# cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=? ;", ("VOCABULARY",))
# print(cur.fetchone())
cur.execute("SELECT vocabulary_id FROM VOCABULARY WHERE vocabulary_id=? ;", ("MELDB",))
print(cur.fetchone())
#Print WHOLE TABLE
# cur.execute('SELECT * FROM CONCEPT;')
# cur.execute('SELECT * FROM CONCEPT WHERE standard_concept = "C";')
# cur.execute('SELECT * FROM CONCEPT WHERE concept_code = "119768002" LIMIT 1;')
# cur.execute('SELECT * FROM CONCEPT WHERE concept_code IN ("119768002", "5905001");')
# cur.execute('SELECT DISTINCT VOCABULARY_ID FROM CONCEPT;')
# df = pd.DataFrame(cur.fetchall())
# print(list(df[0]))
# display(df)
# for row in :
# print(row)
#Get Header of Table
# table="CONCEPT_CLASS"
# cur.execute(f"SELECT * FROM {table} LIMIT 3;")
# print(cur.fetchall())
#create meldb VOCABULARY
# meldb_version='v3.2.10'
# meldb_description = 'Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity'
# meldb_reference = 'https://www.it-innovation.soton.ac.uk/projects/meldb'
# df_test = pd.DataFrame([{
# "vocabulary_id": 'MELDB',
# "vocabulary_name": meldb_description,
# "vocabulary_reference": meldb_reference,
# "vocabulary_version": meldb_version,
# # "vocabulary_concept_id": 0,
# }])
# df_test.to_sql("VOCABULARY", conn, if_exists='append', index=False)
# cur.execute("""
# CREATE TABLE CONCEPT_SET (
# concept_set_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each concept set
# atlas_id INTEGER, -- Unique identifier generated by ATLAS
# concept_set_name TEXT, -- Optional name for the concept set
# concept_set_description TEXT, -- Optional description for the concept set
# vocabulary_id TEXT NOT NULL, -- Foreign key to VOCABULARY table
# FOREIGN KEY (vocabulary_id) REFERENCES VOCABULARY(vocabulary_id)
# );""")
# cur.execute("DROP TABLE CONCEPT_SET;")
# cur.execute("""
# CREATE TABLE CONCEPT_SET_ITEM (
# concept_set_item_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each mapping
# concept_set_id INTEGER NOT NULL, -- Foreign key to CONCEPT_SET table
# concept_id INTEGER NOT NULL, -- Foreign key to CONCEPT table
# FOREIGN KEY (concept_set_id) REFERENCES CONCEPT_SET(concept_set_id),
# FOREIGN KEY (concept_id) REFERENCES CONCEPT(concept_id)
# );""")
# cur.execute("DROP TABLE CONCEPT_SET_ITEM;")
# Be sure to close the connection
conn.close()
```
%% Cell type:code id:d03b75f3-902f-42d7-b52f-dac7e79ecb11 tags:
``` python
conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()
file_path = "/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/HEART_VALVE_DISORDERS.csv"
df = pd.read_csv(file_path, low_memory=False)
df = df.set_index("code")
df.to_sql(name='test', con=conn, if_exists='replace')
conn.commit()
conn.close()
```
%% Cell type:code id:d96c3511-3831-400e-ba40-0a36abcc60d3 tags:
``` python
#DISPLAY SQL TABLE
table="CONCEPT_SET_ITEM"
# Create a SQL connection to our SQLite database
conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()
#Print ALL Columns in Table
cur.execute(f"PRAGMA table_info({table});")
df_cols = pd.DataFrame(cur.fetchall())
print(df_cols)
df_cols = df_cols[1]
#Print TABLE
cur.execute(f"SELECT * FROM {table};")
df = pd.DataFrame(cur.fetchall())
df = df.rename(columns={i:s for i, s in enumerate(df_cols)})
display(df)
conn.close()
# a+s = 13364
# a+s+i = 13591
```
%% Cell type:code id:42d49a00-9646-4ba4-afb6-12297289b7a7 tags:
``` python
def sql_row_exist(conn, table, column, value):
    # Execute and check whether a matching row exists
    cur = conn.cursor()
    query = f"SELECT 1 FROM {table} WHERE {column} = ? LIMIT 1;"
    cur.execute(query, (value,))
    exists = cur.fetchone() is not None
    return exists
```
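%% Cell type:markdown tags:
A usage sketch, checking for the MELDB vocabulary row created earlier:
%% Cell type:code tags:
``` python
# sketch: returns True once the MELDB row has been inserted into VOCABULARY
conn = sqlite3.connect("codes/omop_54.sqlite")
print(sql_row_exist(conn, "VOCABULARY", "vocabulary_id", "MELDB"))
conn.close()
```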
%% Cell type:code id:f7b51bcd-6ee1-4023-8d36-7f419ce4120d tags:
``` python
# EXPORT MELDB CSV OUTPUT
conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()
vocab_output = "MELDB"
vocab_type = "SNOMED"
file_path = "/home/jjd1c23/ssd/meldb/jjd1c23/phenotype/output/V3_2_10_MELD_snomed_no_translate.csv"
# file_path = "/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/HEART_VALVE_DISORDERS.csv"
# Read the CSV file with the specified delimiter
out = pd.read_csv(file_path, low_memory=False)
print(out.columns)  # was print(df.columns); `out` holds the file just read
for concept_set_name, grp in out.groupby("MELDB_concept"):
    # display(concept_set_name, grp[["code", "MELDB_concept"]])
    # Create CONCEPT_SET row if missing
    if not sql_row_exist(conn, "CONCEPT_SET", "concept_set_name", concept_set_name):
        cur.execute(f"INSERT INTO CONCEPT_SET (concept_set_name, vocabulary_id) VALUES ('{concept_set_name}', 'MELDB');")
    else:
        print("concept_set", concept_set_name, "already exists")
        # TODO: ask to remove old concept_set?
    # Get concept_set_id
    query = "SELECT concept_set_id FROM CONCEPT_SET WHERE concept_set_name = ? AND vocabulary_id = ?;"
    cur.execute(query, (concept_set_name, vocab_output))
    concept_set_id = cur.fetchone()[0]
    # Get corresponding concept_id (OMOP) for each concept_code (e.g. SNOMED)
    concept_codes = "'" + "', '".join(list(grp["code"].astype(str))) + "'"
    query = f"SELECT concept_id FROM CONCEPT WHERE vocabulary_id = ? AND concept_code IN ({concept_codes});"
    print(query)
    cur.execute(query, (vocab_type,))
    df_out = pd.DataFrame(cur.fetchall(), columns=["concept_id"])
    if not len(grp) == len(df_out):
        print("ERROR: Some", vocab_type, "codes do not exist in the OMOP database")
    # Create CONCEPT_SET_ITEM rows
    df_out["concept_set_id"] = concept_set_id
    df_out.to_sql("CONCEPT_SET_ITEM", conn, if_exists='append', index=False)
    display(df_out)
    # break

# # Create new CONCEPT_SET
# table_name = os.path.splitext(os.path.basename(file_path))[0]  # Get name of file
# cur.execute(f"INSERT INTO CONCEPT_SET (concept_class_name) VALUES ('{table_name}');")
conn.commit()
conn.close()
```
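%% Cell type:markdown tags:
The INSERT above interpolates `concept_set_name` directly into the SQL string, which breaks on names containing apostrophes. A parameterized variant (a sketch of what the statement inside the loop would become):
%% Cell type:code tags:
``` python
# sketch: parameterized INSERT instead of f-string interpolation
cur.execute(
    "INSERT INTO CONCEPT_SET (concept_set_name, vocabulary_id) VALUES (?, ?);",
    (concept_set_name, "MELDB"),
)
```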
%% Cell type:code id:85007741-e34c-4112-a63c-9fb302b76958 tags:
``` python
"'"+"', '".join(list(grp["code"].astype(str)))+"'"
```
%% Cell type:markdown id:423e7c21-f3bd-439d-9dcb-c17cc2cc6854 tags:
### ATLAS
%% Cell type:code id:c6b45e4d-c7d2-42e7-9b4a-0e9c1c86d34b tags:
``` python
# Create ATLAS Concept Set
import requests

def atlas_create_concept(name, description="", items=[]):
    # `url` and `headers` are expected to be defined elsewhere (WebAPI endpoint + auth)
    data = {
        "id": 0,
        "name": name,
        "description": description,
        "expression": {
            "items": items
        }
    }
    try:
        # Send the POST request
        response = requests.post(url, json=data, headers=headers)
        # Check the response status
        if response.status_code == 200 or response.status_code == 201:
            print("POST request successful:")
            print(response.json())  # assuming the response is JSON
            return response.json()["id"]  # was response["id"], which raises TypeError
        else:
            print(f"POST request failed. HTTP Status Code: {response.status_code}")
            print("Response content:")
            print(response.text)
            return None
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

# Heart Test 1 - 1885487
# Heart Test 2 - 1885488
# Heart Valve Disorders - 1885449
```
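%% Cell type:markdown tags:
A usage sketch, assuming `url` points at a WebAPI concept-set endpoint and `headers` carries any required auth (both hypothetical here; `atlas_items` is the expression list built in the next cell):
%% Cell type:code tags:
``` python
# hypothetical endpoint and headers for the ATLAS demo WebAPI
url = "https://atlas-demo.ohdsi.org/WebAPI/conceptset/"
headers = {"Content-Type": "application/json"}
# new_set_id = atlas_create_concept("HEART_VALVE_DISORDERS", items=atlas_items)
```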
%% Cell type:code id:45497623-1da0-4f74-b21e-da8811c89b04 tags:
``` python
def get_omop_concepts(cur, codes, vocab_id):
    # Build a quoted list for the SQL IN clause
    mask = ""
    for c in codes:
        mask += f'"{c}", '
    mask = mask[:-2]  # remove trailing comma
    # Execute SQL
    cur.execute(f'SELECT * FROM CONCEPT WHERE concept_code IN ({mask}) AND VOCABULARY_ID = "{vocab_id}";')
    df = pd.DataFrame(cur.fetchall())  # convert to pandas df
    print("Identified", len(df[0]), "OMOP Concepts:", list(df[0]))
    return df

def omop_concepts_to_atlas_json(df):
    items = []  # renamed from `json` to avoid shadowing the json module
    for i, row in df.iterrows():
        # template for the ATLAS API
        out = {
            "concept": {
                'CONCEPT_ID': row[0],
                'CONCEPT_NAME': row[1],
                'STANDARD_CONCEPT': 'S',
                'STANDARD_CONCEPT_CAPTION': 'Standard',
                'INVALID_REASON': 'V',
                'INVALID_REASON_CAPTION': 'Valid',
                'CONCEPT_CODE': row[6],
                'DOMAIN_ID': row[2],
                'VOCABULARY_ID': row[3],
                'CONCEPT_CLASS_ID': row[4],
                'VALID_START_DATE': int(row[7]),
                'VALID_END_DATE': int(row[8])
            },
            'isExcluded': False,
            'includeDescendants': False,
            'includeMapped': False
        }
        items.append(out)
    return items

conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()
vocab_id = "SNOMED"  # SNOMED, ATC, ICD10CM, ICD9CM, Read
csv_output = "/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/ANGER.csv"
# Load CSV output file
df_in = pd.read_csv(csv_output)
print(len(df_in))
# df = get_omop_concepts(cur, ["119768002", "5905001"], "SNOMED")
df = get_omop_concepts(cur, list(df_in["code"]), vocab_id)
atlas_items = omop_concepts_to_atlas_json(df)  # renamed from `json` to avoid shadowing
# display(atlas_items)
conn.close()
```
%% Cell type:code id:ea759907-c085-472a-82e2-07b6b19e2c8f tags:
``` python
# ATLAS GET CONCEPT SET
import requests

def request_get(url):
    try:
        # Send the GET request
        response = requests.get(url)
        # Check if the response status code is 200 (OK)
        if response.status_code == 200:
            print("Response data:")
            # print(response.json())  # assuming the response is in JSON format
            return response.json()
        else:
            print(f"Failed to fetch data. HTTP Status Code: {response.status_code}")
            print("Response content:")
            print(response.text)
            return None
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

# GET SET INFO
set_id = "1885449"
url = f"https://atlas-demo.ohdsi.org/WebAPI/conceptset/{set_id}"
request_get(url)
```
%% Cell type:code id:5a70e636-6051-4930-bf1b-30d093fd0552 tags:
``` python
#GET SET ITEMS (Concepts)
set_id = "1885449"
url = f"https://atlas-demo.ohdsi.org/WebAPI/conceptset/{set_id}/expression/ATLASPROD"
response = request_get(url)
display(response)
```
%% Cell type:code id:96bfcd9c-27e8-4be4-a680-7553d908790e tags:
``` python
#ATLAS CREATE CONCEPT SET
```
#! /usr/bin/bash
echo "Removing Corrupted Files from Ho"
rm codes/GitHub_TG_repository/lymphoma_prevalence_birm_cam/lymphoma_prevalence_birm_cam_ICD10.csv
rm codes/GitHub_TG_repository/Menieresdisease_birm_cam/Menieresdisease_birm_cam_ICD10.csv
rm codes/GitHub_TG_repository/peripheral_neuropathy_birm_cam/peripheral_neuropathy_birm_cam_ICD10.csv
rm codes/GitHub_TG_repository/Sjogrenssyndrome_Bham_CAM/Sjogrenssyndrome_Bham_CAM_ICD10.csv
#! /usr/bin/bash
version="V3_2_10"
previous="V3_2_9"
python main.py -r2 PHEN_assign_v3.json CONC_summary_working.xlsx
mv output/MELD_concepts_read.csv output/${version}_MELD_concepts_readv2.csv
python main.py -i PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
mv output/MELD_concepts_read.csv output/${version}_MELD_icd10_no_translate.csv
python main.py -s PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
mv output/MELD_concepts_read.csv output/${version}_MELD_snomed_no_translate.csv
# python main.py -o PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
# mv output/MELD_concepts_read.csv output/${version}_MELD_opcs4_no_translate.csv
python main.py -a PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
mv output/MELD_concepts_read.csv output/${version}_MELD_atc_no_translate.csv
# python main.py -m PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
# mv output/MELD_concepts_read.csv output/${version}_MELD_med_no_translate.csv
mv output/MELD_errors.csv output/${version}_MELD_errors.csv
#Generate Report
rm concepts-output/MELD-report.md
python report.py PHEN_assign_v3.json CONC_summary_working.xlsx codes/ concepts-output/MELD-report.md ${version} ${previous}
#Divide Concepts to Output Repo
rm -rf concepts-output/readv2/*
rm -rf concepts-output/icd10/*
rm -rf concepts-output/snomed/*
rm -rf concepts-output/atc/*
python publish.py output/${version}_MELD_concepts_readv2.csv concepts-output/readv2/
python publish.py output/${version}_MELD_icd10_no_translate.csv concepts-output/icd10/
python publish.py output/${version}_MELD_snomed_no_translate.csv concepts-output/snomed/
python publish.py output/${version}_MELD_atc_no_translate.csv concepts-output/atc/
cp output/${version}_MELD_errors.csv concepts-output/${version}_MELD_errors.csv
# Show Changes in Output repo (should be same as report)
cd concepts-output
git diff --stat