diff --git a/conda.yaml b/conda.yaml deleted file mode 100644 index a504a19865a00e2f53431c1a8d43da89385f95b3..0000000000000000000000000000000000000000 --- a/conda.yaml +++ /dev/null @@ -1,129 +0,0 @@ -name: acmc -channels: - - conda-forge -dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_gnu - - asttokens=3.0.0=pyhd8ed1ab_1 - - blosc=1.21.6=he440d0b_1 - - brotli-python=1.1.0=py313h46c70d0_2 - - bzip2=1.0.8=h4bc722e_7 - - c-ares=1.34.4=hb9d3cd8_0 - - c-blosc2=2.16.0=h3122c55_0 - - ca-certificates=2025.1.31=hbcca054_0 - - certifi=2025.1.31=pyhd8ed1ab_0 - - cffi=1.17.1=py313hfab6e84_0 - - charset-normalizer=3.4.1=pyhd8ed1ab_0 - - comm=0.2.2=pyhd8ed1ab_1 - - debugpy=1.8.12=py313h46c70d0_0 - - decorator=5.1.1=pyhd8ed1ab_1 - - exceptiongroup=1.2.2=pyhd8ed1ab_1 - - executing=2.1.0=pyhd8ed1ab_1 - - h2=4.2.0=pyhd8ed1ab_0 - - hdf5=1.14.3=nompi_h2d575fe_109 - - hpack=4.1.0=pyhd8ed1ab_0 - - hyperframe=6.1.0=pyhd8ed1ab_0 - - idna=3.10=pyhd8ed1ab_1 - - importlib-metadata=8.6.1=pyha770c72_0 - - ipykernel=6.29.5=pyh3099207_0 - - ipython=8.32.0=pyh907856f_0 - - jedi=0.19.2=pyhd8ed1ab_1 - - jupyter_client=8.6.3=pyhd8ed1ab_1 - - jupyter_core=5.7.2=pyh31011fe_1 - - keyutils=1.6.1=h166bdaf_0 - - krb5=1.21.3=h659f571_0 - - ld_impl_linux-64=2.43=h712a8e2_2 - - libaec=1.1.3=h59595ed_0 - - libblas=3.9.0=28_h59b9bed_openblas - - libcblas=3.9.0=28_he106b2a_openblas - - libcurl=8.11.1=h332b0f4_0 - - libedit=3.1.20250104=pl5321h7949ede_0 - - libev=4.33=hd590300_2 - - libexpat=2.6.4=h5888daf_0 - - libffi=3.4.6=h2dba641_0 - - libgcc=14.2.0=h77fa898_1 - - libgcc-ng=14.2.0=h69a702a_1 - - libgfortran=14.2.0=h69a702a_1 - - libgfortran5=14.2.0=hd5240d6_1 - - libgomp=14.2.0=h77fa898_1 - - liblapack=3.9.0=28_h7ac8fdf_openblas - - liblzma=5.6.4=hb9d3cd8_0 - - libmpdec=4.0.0=h4bc722e_0 - - libnghttp2=1.64.0=h161d5f1_0 - - libopenblas=0.3.28=pthreads_h94d23a6_1 - - libsodium=1.0.20=h4ab18f5_0 - - libsqlite=3.48.0=hee588c1_1 - - libssh2=1.11.1=hf672d98_0 - - libstdcxx=14.2.0=hc0a3c3a_1 - - libstdcxx-ng=14.2.0=h4852527_1 - - libuuid=2.38.1=h0b41bf4_0 - - libzlib=1.3.1=hb9d3cd8_2 - - lz4-c=1.10.0=h5888daf_1 - - matplotlib-inline=0.1.7=pyhd8ed1ab_1 - - ncurses=6.5=h2d0b736_3 - - nest-asyncio=1.6.0=pyhd8ed1ab_1 - - nomkl=1.0=h5ca1d4c_0 - - numexpr=2.10.2=py313h5f97788_100 - - numpy=2.2.3=py313h17eae1a_0 - - openssl=3.4.1=h7b32b05_0 - - packaging=24.2=pyhd8ed1ab_2 - - pandas=2.2.3=py313ha87cce1_1 - - parso=0.8.4=pyhd8ed1ab_1 - - pexpect=4.9.0=pyhd8ed1ab_1 - - pickleshare=0.7.5=pyhd8ed1ab_1004 - - pip=25.0.1=pyh145f28c_0 - - platformdirs=4.3.6=pyhd8ed1ab_1 - - prompt-toolkit=3.0.50=pyha770c72_0 - - psutil=6.1.1=py313h536fd9c_0 - - ptyprocess=0.7.0=pyhd8ed1ab_1 - - pure_eval=0.2.3=pyhd8ed1ab_1 - - py-cpuinfo=9.0.0=pyhd8ed1ab_1 - - pycparser=2.22=pyh29332c3_1 - - pygments=2.19.1=pyhd8ed1ab_0 - - pysocks=1.7.1=pyha55dd90_7 - - pytables=3.10.2=py313hd261420_1 - - python=3.13.1=ha99a958_105_cp313 - - python-dateutil=2.9.0.post0=pyhff2d567_1 - - python-tzdata=2025.1=pyhd8ed1ab_0 - - python_abi=3.13=5_cp313 - - pytz=2024.1=pyhd8ed1ab_0 - - pyzmq=26.2.1=py313h8e95178_0 - - readline=8.2=h8228510_1 - - requests=2.32.3=pyhd8ed1ab_1 - - six=1.17.0=pyhd8ed1ab_0 - - snappy=1.2.1=h8bd8927_1 - - stack_data=0.6.3=pyhd8ed1ab_1 - - tk=8.6.13=noxft_h4845f30_101 - - tornado=6.4.2=py313h536fd9c_0 - - traitlets=5.14.3=pyhd8ed1ab_1 - - typing-extensions=4.12.2=hd8ed1ab_1 - - typing_extensions=4.12.2=pyha770c72_1 - - tzdata=2025a=h78e105d_0 - - urllib3=2.3.0=pyhd8ed1ab_0 - - wcwidth=0.2.13=pyhd8ed1ab_1 - - zeromq=4.3.5=h3b0a872_7 - - zipp=3.21.0=pyhd8ed1ab_1 - - zlib-ng=2.2.4=h7955e40_0 - - zstandard=0.23.0=py313h80202fe_1 - - zstd=1.5.6=ha6fb4c9_0 - - pip: - - aiosqlite==0.21.0 - - click==8.1.8 - - cramjam==2.9.1 - - et-xmlfile==2.0.0 - - fastparquet==2024.11.0 - - fsspec==2025.2.0 - - gitdb==4.0.12 - - gitpython==3.1.44 - - greenlet==3.1.1 - - iniconfig==2.0.0 - - lxml==5.3.1 - - openpyxl==3.1.5 - - pluggy==1.5.0 - - pyarrow==19.0.0 - - pyomop==4.3.0 - - pytest==8.3.4 - - simpledbf==0.2.6 - - smmap==5.0.2 - - sqlalchemy==2.0.38 -prefix: /opt/conda/envs/acmc diff --git a/notebook/process_codes_WP.ipynb b/notebook/process_codes_WP.ipynb deleted file mode 100644 index 68e26be260fd3eece0d460dc54adf53d712bb8cc..0000000000000000000000000000000000000000 --- a/notebook/process_codes_WP.ipynb +++ /dev/null @@ -1,1288 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "8c8f4cdf-04a5-4762-895e-6555781a1f05", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import json" - ] - }, - { - "cell_type": "markdown", - "id": "c5786d78-7dc2-4f02-ad21-cee95e473823", - "metadata": { - "jp-MarkdownHeadingCollapsed": true, - "tags": [] - }, - "source": [ - "### Ho generate JSON" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0292dc90-e31a-4724-8536-d0b55533aaef", - "metadata": {}, - "outputs": [], - "source": [ - "#List v4 to json\n", - "\n", - "df = pd.read_excel(\"PHEN_code_lists_sources_V4.xlsx\", sheet_name=\"ho\", dtype=str)\n", - "# df = df.sort_values(by=\"mapped_condition\")\n", - "\n", - "def json_file_template(file, cons, types, metadata):\n", - " concepts = \"\"\n", - " for concept in cons:\n", - " concepts += f'\"{concept}\", '\n", - " concepts = concepts[:-2] #remove last ,\n", - " \n", - " type_str = \"\"\n", - " for k, v in types.items():\n", - " type_str += f'\"{k}\":\"{v}\", '\n", - " type_str = type_str[:-2]\n", - " \n", - " meta_str = '\"metadata\":['\n", - " for v in metadata:\n", - " meta_str += f'\"{v}\", '\n", - " meta_str = meta_str[:-2]\n", - " meta_str = meta_str + \"]\"\n", - " \n", - " return '''\n", - " { \n", - " \\\"file\\\":\\\"'''+file+'''\",\n", - " \\\"columns\\\":{\n", - " '''+type_str+''',\n", - " '''+meta_str+'''\n", - " },\n", - " \\\"meldb_phenotypes\\\":['''+concepts+''']\n", - " },'''\n", - "\n", - "out = '\"files\":['\n", - "folder = \"codes/GitHub_TG_repository/\"\n", - "for file, grp in df.groupby(\"mapped_condition\"):\n", - " file = file.replace(\"%20\", \" \") \n", - " \n", - " for ext in [\"_CPRD_GOLD.csv\", \"_CPRD_AURUM.csv\", \"_IMRD.csv\"]:\n", - " path = file+\"/\"+file+ext\n", - " if os.path.isfile(folder+path):\n", - " out+= json_file_template(path, grp[\"meldb_condition\"],\n", - " types={\n", - " \"read2_code\":\"READ_CODE\",\n", - " \"snomed_code\":\"SNOMED_CT_CODE\",\n", - " # \"med_code\":\"MEDICAL_CODE_ID\",\n", - " },\n", - " metadata = [\"DESCRIPTION\"]\n", - " )\n", - " else:\n", - " print(\"NOT FILE\", folder+path)\n", - " for ext in [\"_ICD10.csv\"]:\n", - " path = file+\"/\"+file+ext\n", - " if os.path.isfile(folder+path):\n", - " out+= json_file_template(path, grp[\"meldb_condition\"],\n", - " types={\n", - " \"icd10_code\":\"READ_CODE\",\n", - " \"snomed_code\":\"SNOMED_CT_CODE\",\n", - " # \"icd10_code\":\"MEDICAL_CODE_ID\",\n", - " },\n", - " metadata = [\"DESCRIPTION\"]\n", - " )\n", - " else:\n", - " print(\"NOT FILE\", folder+path)\n", - " \n", - " \n", - " \n", - " # out+= json_file_template(file+\"/\"+file+\"_CPRD_AURUM.csv\", grp[\"meldb_condition\"])\n", - " # out+= json_file_template(file+\"/\"+file+\"_ICD10.csv\", grp[\"meldb_condition\"])\n", - " # out+= json_file_template(file+\"/\"+file+\"_IMRD.csv\", grp[\"meldb_condition\"])\n", - "\n", - " # out += f' \"{file}/{file}_CPRD_GOLD.csv\":[{conds}],\\n'\n", - " # out += f' \"{file}/{file}_CPRD_AURUM.csv\":[{conds}],\\n'\n", - " # out += f' \"{file}/{file}_ICD10.csv\":[{conds}],\\n'\n", - " # out += f' \"{file}/{file}_IMRD.csv\":[{conds}],\\n'\n", - " \n", - "out = out[:-1] #remove last ,\n", - "out += \"\\n]\"\n", - "out = out.replace(\"%20\", \" \") \n", - "print(out)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f155b635-b459-4aff-81b2-e065fc223858", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d040eda5-4028-4047-834c-7315e307e415", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_parquet(\"maps/processed/icd10_code.parquet\")\n", - "df\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0228ac9-8852-4818-b7f0-98429ca5229c", - "metadata": {}, - "outputs": [], - "source": [ - "code = [\"A00.0\", \"*00.0\"]\n", - "code = pd.Series(code)\n", - "print(code.isin(df[\"icd10_code\"]))\n", - "print(code.isin(df[\"icd10_alt_code\"]))\n", - "# print( )\n", - "~(\n", - " ~code.isin(df[\"icd10_code\"]) \n", - " &\n", - " ~code.isin(df[\"icd10_alt_code\"])\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "18efcacd-45f0-4341-86cc-d8e2e584350c", - "metadata": { - "jp-MarkdownHeadingCollapsed": true, - "tags": [] - }, - "source": [ - "### Analyse the JSON file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85dc197b-451e-4fa9-a53b-e6770c132123", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import os\n", - "\n", - "path_json = \"../concepts/PHEN_assign_v3.json\"\n", - "\n", - "#Load JSON Concept Definitions\n", - "mapping = json.load(open(path_json,'rb'))\n", - "summary_config = mapping[\"concept_sets\"][\"concept_set\"]\n", - "summary_df = pd.DataFrame(summary_config) #change to dataframe\n", - "\n", - "summary_df = summary_df.join(pd.json_normalize(summary_df[\"metadata\"])) #metadata to columns\n", - "summary_df = summary_df.drop(columns=[\"metadata\"])\n", - "summary_df = summary_df.rename(columns={\"concept_set_name\":\"CONCEPT_SET\"})\n", - "summary_df = summary_df.drop_duplicates() #remove duplicates\n", - " \n", - "summary_df\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c9b6b3f-08aa-4f61-b9b2-44a24b5d00a0", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import os\n", - "\n", - "path_json = \"PHEN_assign_v3.json\"\n", - "path_excel = \"PHEN_summary_working.xlsx\"\n", - "path_codes = \"codes/\"\n", - "\n", - "#Get all Files in JSON\n", - "def get_json_files(path_json):\n", - " folders = json.load(open(path_json,'rb'))\n", - " out = []\n", - " for folder in folders:\n", - " if \"files\" in folder:\n", - " for file in folder[\"files\"]:\n", - " file_path = folder[\"folder\"]+\"/\"+file[\"file\"]\n", - " if \"meldb_phenotypes\" in file:\n", - " for concept in file[\"meldb_phenotypes\"]:\n", - " out.append({\"json_concept\":concept, \"filepath\":file_path, \"json_code_types\":list(file[\"columns\"].keys())})\n", - " elif \"meldb_phenotypes_categories\" in file:\n", - " for code, concept in file[\"meldb_phenotypes_categories\"].items():\n", - " out.append({\"json_concept\":concept[0], \"filepath\":file_path, \"json_code_types\":list(file[\"columns\"].keys())})\n", - " else:\n", - " out.append({\"json_concept\":None, \"filepath\":file_path})\n", - "\n", - " out = pd.DataFrame(out)\n", - " out[\"filepath\"] = out[\"filepath\"].astype(str)\n", - " return out\n", - "out = get_json_files(path_json)\n", - "\n", - "#Get all Files Excel Summary\n", - "def get_excel_files(path_excel):\n", - " path_excel = \"PHEN_summary_working.xlsx\"\n", - " out2 = pd.read_excel(path_excel)\n", - " out2 = out2[[\"CONCEPT NAME \", \"CODING LIST\", \"AGREED\", \"FUNCTION\"]].loc[1:] #select relevant columns\n", - "\n", - " #Filter Concepts in use\n", - " out2 = out2[out2[\"AGREED\"] == \"USE\"] #remove deprecated concepts\n", - " out2 = out2[out2[\"FUNCTION\"] == \"QUERY BY CODING LIST\"] #remove deprecated concepts\n", - " out2 = out2.drop(['AGREED', 'FUNCTION'], axis=1)\n", - "\n", - " #Get filepaths\n", - " out2[\"CODING LIST\"] = out2[\"CODING LIST\"].str.split(\",\") #split by ,\n", - " out2 = out2.explode(\"CODING LIST\") #one row per file\n", - " out2[\"CODING LIST\"] = out2[\"CODING LIST\"].str.strip()\n", - " out2[\"CODING LIST\"] = out2[\"CODING LIST\"].str.replace(\"https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/\", \"\")\n", - " out2[\"CODING LIST\"] = out2[\"CODING LIST\"].str.replace(\"%20\", \" \")\n", - "\n", - " out2 = out2.rename(columns={\"CONCEPT NAME \":\"excel_concept\", \"CODING LIST\":\"filepath\"})\n", - " return out2\n", - "out2 = get_excel_files(path_excel)\n", - "\n", - "#Get all Files in /codes\n", - "def get_code_files(path_codes):\n", - " all_files = []\n", - " for root, dirs, files in os.walk(path_codes, topdown=False):\n", - " for name in files:\n", - " if \".ipynb_checkpoint\" not in root: #exclude notebook checkpoints\n", - " if name.endswith(\".csv\") or name.endswith(\".xlsx\") or name.endswith(\".dta\"): #exclude non-data files\n", - " all_files.append(os.path.join(root, name)) \n", - " all_files = pd.DataFrame(all_files)\n", - " all_files = all_files.rename(columns={0:\"filepath\"})\n", - " all_files[\"filepath\"] = all_files[\"filepath\"].astype(str)\n", - " return all_files\n", - "all_files = get_code_files(path_codes)\n", - "\n", - "\n", - "print(\"ALL FILES\", len(all_files), len(all_files[\"filepath\"].unique()))\n", - "print(\"JSON CONCEPTS\", len(out), len(out[\"filepath\"].unique()))\n", - "print(\"EXCEL CONCEPTS\", len(out2), len(out2[\"filepath\"].unique()))\n", - "\n", - "outs = pd.merge(all_files, out, how=\"outer\", on=\"filepath\")\n", - "outs = pd.merge(outs, out2, how=\"outer\", on=\"filepath\")\n", - "print(len(outs), len(outs[\"filepath\"].unique()))\n", - "outs.to_csv(\"output/MELD_file_to_concept.csv\", index=False)\n", - "\n", - "# display(outs[ outs[\"concept\"].isna()])\n", - "\n", - "# display(out ) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8e70c33-c869-46f8-953e-f6b52992cfbb", - "metadata": {}, - "outputs": [], - "source": [ - "display(\"JSON MISSING\", outs[outs[\"json_concept\"].isna() & outs[\"excel_concept\"].notna()])\n", - "display(\"EXCEL MISSING\", outs[outs[\"json_concept\"].notna() & outs[\"excel_concept\"].isna()])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d84465f-f064-4df2-b0e4-2dfb217aea21", - "metadata": {}, - "outputs": [], - "source": [ - "f = open('concepts-output/MELD-report.md', 'a') as f:\n", - " f.write(\n", - " \"\"\"\n", - "# Report\n", - "- One thing\n", - "- Two thing\n", - "- Three thing\n", - " \"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f7fc771-e406-42c7-8a09-16a20b5298f5", - "metadata": {}, - "outputs": [], - "source": [ - "total_length = 0\n", - "for file in all_files[\"filepath\"]:\n", - " if file.endswith(\".csv\"):\n", - " df_file = pd.read_csv(file)\n", - " total_length += len(df_file)\n", - " elif file.endswith(\".xlsx\"):\n", - " df_file = pd.read_excel(file)\n", - " total_length += len(df_file)\n", - " elif file.endswith(\".dta\"):\n", - " df_file = pd.read_stata(file)\n", - " total_length += len(df_file)\n", - "total_length\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "08a9c565-28d6-46ee-9fa8-6fa0ee28a4d5", - "metadata": {}, - "outputs": [], - "source": [ - "#turn filepaths into gitlab links\n", - "outs2 = outs.copy()\n", - "outs2[\"filepath\"] = \"https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/\"+outs2[\"filepath\"].str.replace(\" \", \"%20\")\n", - "\n", - "#Groupby concepts and concat filepaths\n", - "outs2 = outs2.groupby(\"concept\")[\"filepath\"].apply(', '.join).reset_index()\n", - "outs2 = outs2.sort_values(by=[\"concept\"])\n", - "outs2\n", - "outs2.to_csv(\"output/MELD_GitLab_link_to_concept.csv\", index=False)" - ] - }, - { - "cell_type": "markdown", - "id": "357bb84c-90c2-4b5f-95c0-443191783a7f", - "metadata": { - "jp-MarkdownHeadingCollapsed": true, - "tags": [] - }, - "source": [ - "### Analyse Output Files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d3f9cb7-be86-4f1f-92f6-991094eb7bb7", - "metadata": {}, - "outputs": [], - "source": [ - "version = \"V2_2_2\"\n", - "output_files = [f\"output/{version}_MELD_concepts_readv2.csv\",\n", - " f\"output/{version}_MELD_snomed_no_translate.csv\",\n", - " f\"output/{version}_MELD_icd10_no_translate.csv\",\n", - " # f\"output/{version}_MELD_med_no_translate.csv\",\n", - " f\"output/{version}_MELD_atc_no_translate.csv\"\n", - " ]\n", - "error_file = f\"output/{version}_MELD_errors.csv\"\n", - "\n", - "for output_file in output_files:\n", - " print(\"---\"*3,output_file,\"---\"*3,)\n", - " df = pd.read_csv(output_file)\n", - " # df[\"MELDB_concept\"].loc[df[\"CONCEPT TYPE\"].isna()]\n", - " print(\"MELDB missing concepts \", len(df[df[\"CONCEPT TYPE\"].isna()]))\n", - " if df[\"code\"].dtype == \"object\":\n", - " print(\"Chars present:\", np.sort(df[\"code\"].apply(lambda x : set(x)).explode().unique()))\n", - " \n", - "# len(df[\"MELDB_concept\"].unique())\n", - "\n", - "print(\"---\"*3,error_file,\"---\"*3,)\n", - "df = pd.read_csv(error_file)\n", - "df = df.drop_duplicates()\n", - "df[\"CODE_TYPE\"].value_counts()\n", - "# for i, row in df.drop_duplicates().iterrows():\n", - "# print(row[\"CODE\"], row[\"CODE_TYPE\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "08e0ecc1-9271-48c3-9c5b-094800072906", - "metadata": {}, - "outputs": [], - "source": [ - "def get_output_files(version):\n", - " output_files = [f\"output/{version}_MELD_concepts_readv2.csv\",\n", - " f\"output/{version}_MELD_snomed_no_translate.csv\",\n", - " f\"output/{version}_MELD_icd10_no_translate.csv\",\n", - " # f\"output/{version}_MELD_med_no_translate.csv\",\n", - " f\"output/{version}_MELD_atc_no_translate.csv\"\n", - " ]\n", - " error_file = f\"output/{version}_MELD_errors.csv\"\n", - " return output_files, error_file\n", - "\n", - "# version_1 = \"V1_0_0\"\n", - "version_1 = \"V2_1_4\"\n", - "version_2 = \"V2_2_3\"\n", - "output1, err1 = get_output_files(version_1)\n", - "output2, err2 = get_output_files(version_2)\n", - "\n", - "print(\"## Compare Concepts\", version_1, \"to\", version_2)\n", - "\n", - "for out1, out2 in zip(output1, output2):\n", - " print(out1, out2 )\n", - " df1 = pd.read_csv(out1)\n", - " df1 = df1[[\"code\",\"MELDB_concept\"]].groupby(\"MELDB_concept\").count()\n", - " df2 = pd.read_csv(out2)\n", - " df2 = df2[[\"code\",\"MELDB_concept\"]].groupby(\"MELDB_concept\").count()\n", - " \n", - " #Added/Removed Concepts\n", - " print(\"- Removed Concepts\", list(set(df1.index) - set(df2.index)))\n", - " print(\"- Added Concepts\", list(set(df2.index) - set(df1.index)))\n", - " \n", - " #Changed Concepts\n", - " diff = df2 - df1 #diff in counts \n", - " diff = diff[(~(diff[\"code\"] == 0.0)) & diff[\"code\"].notna()] #get non-zero counts\n", - " s = \"\\n\"\n", - " for concept, row in diff.iterrows():\n", - " s += \"\\t - {} {}\\n\".format(concept, row[\"code\"])\n", - " print(\"- Changed Concepts\", s)\n", - "\n", - "\n", - "# for output_file in output_files:\n", - "# print(\"---\"*3,output_file,\"---\"*3,)\n", - "# df = pd.read_csv(output_file)\n", - "# # df[\"MELDB_concept\"].loc[df[\"CONCEPT TYPE\"].isna()]\n", - "# print(\"MELDB missing concepts \", len(df[df[\"CONCEPT TYPE\"].isna()]))\n", - "# if df[\"code\"].dtype == \"object\":\n", - "# print(\"Chars present:\", np.sort(df[\"code\"].apply(lambda x : set(x)).explode().unique()))\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc60c137-5a85-4155-af6b-6796f8c05980", - "metadata": {}, - "outputs": [], - "source": [ - "import glob\n", - "import os\n", - "import pandas as pd\n", - "\n", - "df = pd.read_csv(\"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/PHEN_summary_working.csv\")\n", - "df = df.set_index(\"#\")\n", - "\n", - "for vocab in [\"atc\", \"icd10\", \"readv2\", \"snomed\"]:\n", - " df[vocab.upper()] = \"\"\n", - "\n", - " for file in glob.glob(f\"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/{vocab}/*.csv\"):\n", - " concept_set = os.path.basename(file)[:-4]\n", - " row_index = df[df[\"CONCEPT NAME \"] == concept_set].index[0]\n", - "\n", - " df.loc[row_index, vocab.upper()] = \"YES\"\n", - "\n", - "df = df.drop(columns=[\"READv2_CODE\", \"ICD10_CODE\"])\n", - "df.to_csv(\"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/PHEN_summary_working_labelled.csv\")" - ] - }, - { - "cell_type": "markdown", - "id": "e5c4291f-847b-4c82-976e-bd5b3a7b6bcc", - "metadata": { - "jp-MarkdownHeadingCollapsed": true, - "tags": [] - }, - "source": [ - "### Mappings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "08e34750-413c-469e-bcb8-e71bb188ff42", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "#NHS Read Browser\n", - "import simpledbf\n", - "import pandas as pd\n", - "\n", - "\n", - "#r2 only\n", - "df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2/ANCESTOR.DBF').to_dataframe()\n", - "df = pd.concat([df['READCODE'], df['DESCENDANT']])\n", - "df = pd.DataFrame(df.drop_duplicates())\n", - "df = df.rename(columns={0:\"read2_code\"})\n", - "df.to_parquet(\"maps/processed/read2_code.parquet\", index=False)\n", - "\n", - "#r2 -> atc\n", - "df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2/ATC.DBF').to_dataframe()\n", - "df = df[[\"READCODE\", \"ATC\"]]\n", - "df = df.rename(columns={\"READCODE\":\"read2_code\", \"ATC\":\"atc_code\"})\n", - "df.to_parquet(\"maps/processed/read2_code_to_atc_code.parquet\", index=False)\n", - "\n", - "#r2 -> icd10\n", - "df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2/ICD10.DBF').to_dataframe()\n", - "df = df[[\"READ_CODE\", \"TARG_CODE\"]]\n", - "df = df.rename(columns={\"READ_CODE\":\"read2_code\", \"TARG_CODE\":\"icd10_code\"})\n", - "df = df[~df[\"icd10_code\"].str.match(\"^.*-.*$\")] #remove codes with '-'\n", - "df = df[~df[\"read2_code\"].str.match(\"^.*-.*$\")] #remove codes with '-'\n", - "df.to_parquet(\"maps/processed/read2_code_to_icd10_code.parquet\", index=False)\n", - "\n", - "#r2 -> opcs4\n", - "df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2/OPCS4V3.DBF').to_dataframe()\n", - "df = df[[\"READ_CODE\", \"TARG_CODE\"]]\n", - "df = df.rename(columns={\"READ_CODE\":\"read2_code\", \"TARG_CODE\":\"opcs4_code\"})\n", - "df = df[~df[\"opcs4_code\"].str.match(\"^.*-.*$\")] #remove codes with '-'\n", - "df = df[~df[\"read2_code\"].str.match(\"^.*-.*$\")] #remove codes with '-'\n", - "df.to_parquet(\"maps/processed/read2_code_to_opcs4_code.parquet\", index=False)\n", - "\n", - "#r3 only\n", - "df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V3/ANCESTOR.DBF').to_dataframe()\n", - "df = pd.concat([df['READCODE'], df['DESCENDANT']])\n", - "df = pd.DataFrame(df.drop_duplicates())\n", - "df = df.rename(columns={0:\"read3_code\"})\n", - "df.to_parquet(\"maps/processed/read3_code.parquet\", index=False)\n", - "\n", - "#r3 -> icd10\n", - "df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V3/ICD10.DBF').to_dataframe()\n", - "df = df[[\"READ_CODE\", \"TARG_CODE\"]]\n", - "df = df.rename(columns={\"READ_CODE\":\"read3_code\", \"TARG_CODE\":\"icd10_code\"})\n", - "df = df[~df[\"icd10_code\"].str.match(\"^.*-.*$\")] #remove codes with '-'\n", - "df = df[~df[\"read3_code\"].str.match(\"^.*-.*$\")] #remove codes with '-'\n", - "df.to_parquet(\"maps/processed/read3_code_to_icd10_code.parquet\", index=False)\n", - "\n", - "#r3 -> icd9\n", - "# dbf = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V3/ICD9V3.DBF')\n", - "\n", - "#r3 -> opcs4\n", - "df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V3/OPCS4V3.DBF').to_dataframe()\n", - "df = df[[\"READ_CODE\", \"TARG_CODE\"]]\n", - "df = df.rename(columns={\"READ_CODE\":\"read3_code\", \"TARG_CODE\":\"opcs4_code\"})\n", - "df = df[~df[\"opcs4_code\"].str.match(\"^.*-.*$\")] #remove codes with '-'\n", - "df = df[~df[\"read3_code\"].str.match(\"^.*-.*$\")] #remove codes with '-'\n", - "df.to_parquet(\"maps/processed/read3_code_to_opcs4_code.parquet\", index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5fe95638-1f25-45f3-803c-2fff74a2a4fd", - "metadata": {}, - "outputs": [], - "source": [ - "#NHS Data Migrations\n", - "\n", - "#r2 only\n", - "# df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/rctcremap_uk_20200401000001.txt', sep='\\t')\n", - "\n", - "#r3 only\n", - "# df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/ctv3cremap_uk_20200401000001.txt', sep='\\t')\n", - "\n", - "#snomed only\n", - "df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/sctcremap_uk_20200401000001.txt', sep='\\t')\n", - "df = df[[\"SCT_CONCEPTID\"]]\n", - "df = df.rename(columns={\"SCT_CONCEPTID\":\"snomed_code\"})\n", - "df = df.drop_duplicates()\n", - "df = df.astype(str)\n", - "df.to_parquet(\"maps/processed/snomed_code.parquet\", index=False)\n", - "\n", - "#r2 -> r3\n", - "df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/rctctv3map_uk_20200401000001.txt', sep='\\t')\n", - "df = df[[\"V2_CONCEPTID\", \"CTV3_CONCEPTID\"]]\n", - "df = df.rename(columns={\"V2_CONCEPTID\":\"read2_code\",\n", - " \"CTV3_CONCEPTID\":\"read3_code\"})\n", - "df.to_parquet(\"maps/processed/read2_code_to_read3_code.parquet\", index=False)\n", - "\n", - "#r3->r2\n", - "df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/ctv3rctmap_uk_20200401000002.txt', sep='\\t')\n", - "df = df[[\"CTV3_CONCEPTID\", \"V2_CONCEPTID\"]]\n", - "df = df.rename(columns={\"CTV3_CONCEPTID\":\"read3_code\", \n", - " \"V2_CONCEPTID\":\"read2_code\"})\n", - "df = df.drop_duplicates()\n", - "df = df[~df[\"read2_code\"].str.match(\"^.*_.*$\")] #remove r2 codes with '_'\n", - "df.to_parquet(\"maps/processed/read3_code_to_read2_code.parquet\", index=False)\n", - "\n", - "\n", - "#r2 -> snomed\n", - "df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/rcsctmap2_uk_20200401000001.txt', sep='\\t', dtype=str)\n", - "df = df[[\"ReadCode\", \"ConceptId\"]]\n", - "df = df.rename(columns={\"ReadCode\":\"read2_code\",\n", - " \"ConceptId\":\"snomed_code\"})\n", - "df.to_parquet(\"maps/processed/read2_code_to_snomed_code.parquet\", index=False)\n", - "\n", - "\n", - "#r3->snomed\n", - "df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/ctv3sctmap2_uk_20200401000001.txt', sep='\\t')\n", - "df = df[[\"CTV3_TERMID\", \"SCT_CONCEPTID\"]]\n", - "df = df.rename(columns={\"CTV3_TERMID\":\"read3_code\",\n", - " \"SCT_CONCEPTID\":\"snomed_code\"})\n", - "df[\"snomed_code\"] = df[\"snomed_code\"].astype(str)\n", - "df = df[~df[\"snomed_code\"].str.match(\"^.*_.*$\")] #remove snomed codes with '_'\n", - "df.to_parquet(\"maps/processed/read3_code_to_snomed_code.parquet\", index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "267fa1cc-5159-48c4-9eee-19af5039d627", - "metadata": {}, - "outputs": [], - "source": [ - "#OPCS410 Data Files\n", - "df = pd.read_csv(\"maps/OPCS410 Data files txt/OPCS410 CodesAndTitles Nov 2022 V1.0.txt\", sep='\\t', dtype=str, header=None)\n", - "df = df.rename(columns={0:\"opcs4_code\", 1:\"description\"})\n", - "df.to_parquet(\"maps/processed/opcs4_code.parquet\", index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "01d046fd-69af-44f3-acad-5d0edef3f745", - "metadata": {}, - "outputs": [], - "source": [ - "#ICD10_edition5\n", - "df = pd.read_xml(\"maps/ICD10_Edition5_XML_20160401/Content/ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml\",)\n", - "df = df[[\"CODE\", \"ALT_CODE\", \"DESCRIPTION\"]]\n", - "df = df.rename(columns={\"CODE\":\"icd10_code\",\n", - " \"ALT_CODE\":\"icd10_alt_code\",\n", - " \"DESCRIPTION\":\"description\"\n", - " })\n", - "df.to_parquet(\"maps/processed/icd10_code.parquet\", index=False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "36630e24-f56c-48e1-8ecf-4ccd2b41eaea", - "metadata": {}, - "outputs": [], - "source": [ - "code1=\"read2_code\"\n", - "code2=\"icd10_code\"\n", - "df_map = pd.read_parquet(f\"maps/processed/{code1}_to_{code2}.parquet\")\n", - "\n", - "codes=df_map[\"read2_code\"].iloc[:5]\n", - "\n", - "pd.merge(codes, df_map, how='left')[code2]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9787adeb-8507-488b-9a91-b8df3fbbe21e", - "metadata": {}, - "outputs": [], - "source": [ - "#CPRD Code Browser\n", - "df = pd.read_csv('maps/CPRD_CodeBrowser_202211_Aurum/CPRDAurumMedical.txt', sep='\\t')\n", - "df = df[[\"MedCodeId\", \"CleansedReadCode\", \"SnomedCTConceptId\"]]\n", - "df = df.rename(columns={\"MedCodeId\":\"med_code\",\n", - " \"CleansedReadCode\":\"read2_code\",\n", - " \"SnomedCTConceptId\":\"snomed_code\"})\n", - "\n", - "# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_Aurum/CPRDAurumProduct.txt', sep='\\t', dtype=str)\n", - "\n", - "# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_GOLD/medical.txt', sep='\\t')\n", - "# df = df.reset_index().iloc[:,[1,6]]\n", - "# df = df.rename(columns={\"level_1\":\"read2_code\", \"20220523\":\"description\"})\n", - "# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_GOLD/product.txt', sep='\\t', dtype=str) #CANNOT OPEN\n", - "\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a968ffb1-4337-456b-8d20-419888b4044f", - "metadata": {}, - "outputs": [], - "source": [ - "#BNF\n", - "\n", - "df = pd.read_excel(\"maps/BNF Snomed Mapping data 20231215.xlsx\")\n", - "df = df.astype(str)\n", - "df = df.rename(columns={\"BNF Code\":\"bnf_code\",\n", - " \"SNOMED Code\":\"snomed_code\"})\n", - "df[[\"bnf_code\", \"snomed_code\"]].to_parquet(\"maps/processed/bnf_code_to_snomed_code.parquet\", index=False)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c70b1ce2-0f41-4d02-ad17-6fc44bc3c6bf", - "metadata": {}, - "outputs": [], - "source": [ - "#BNF to Readv2 Merge\n", - "df1 = pd.read_parquet(\"maps/processed/bnf_code_to_snomed_code.parquet\").astype(str)\n", - "df2 = pd.read_parquet(\"maps/processed/read2_code_to_snomed_code.parquet\").astype(str)\n", - "# df1.merge(df2, how=\"inner\", on=\"snomed_code\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5d34237-02d4-4dea-8c20-5adaf337f6b5", - "metadata": {}, - "outputs": [], - "source": [ - "df1.merge(df2, how='inner', on='snomed_code')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b3166cf0-e4a5-43e0-aeac-78827427422e", - "metadata": {}, - "outputs": [], - "source": [ - ".astype(str).dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c0a766f9-7959-4a10-b58f-cd946a878b60", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv(\"../concepts/PHEN_summary_working.csv\")\n", - "cols = list(df.columns)\n", - "cols.remove('CONCEPT NAME ')\n", - "cols.remove('AGREED')\n", - "df = df.applymap(lambda x: str(x) if isinstance(x, (int, float)) else x) #change to int\n", - "\n", - "df_copy = df.rename(columns={\n", - " \"CONCEPT NAME \":\"concept_set_name\",\n", - " \"AGREED\":\"concept_set_status\"\n", - "})\n", - "df_copy[\"concept_set_status\"] = df_copy[\"concept_set_status\"].replace(\"USE\", \"AGREED\")\n", - "df_copy = df_copy[[\"concept_set_name\", \"concept_set_status\"]]\n", - "outs = df_copy.to_dict(orient='records')\n", - "\n", - "for i, out in enumerate(outs):\n", - " out[\"metadata\"] = dict(df[cols].iloc[i])\n", - " \n", - "json.dumps(outs)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a204a95-dc4c-4183-9ea7-f5c5e95e9087", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ce1ab58-50b4-4c22-b72b-c698de6830f7", - "metadata": {}, - "outputs": [], - "source": [ - "import json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1ea81c6-d1db-408f-9d3a-b96f44efe21f", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "5eb544a3-9dd1-41e8-88c2-a808646c6112", - "metadata": { - "jp-MarkdownHeadingCollapsed": true, - "tags": [] - }, - "source": [ - "### OMOP Database" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9e58e62-9e44-4d0c-9d8d-35c175c07e6c", - "metadata": {}, - "outputs": [], - "source": [ - "import sqlite3\n", - "import csv\n", - "import pandas as pd\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f67c9a1-373f-4799-8a85-72767662d912", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d0ecdf69-ee90-42c1-ad25-d8357b603d1b", - "metadata": {}, - "outputs": [], - "source": [ - "#IMPORT OMOP VOCABS\n", - "conn = sqlite3.connect(\"codes/omop_54.sqlite\") # change to 'sqlite:///your_filename.db'\n", - "folder_path = \"codes/vocabulary_download_v5_{9424944c-2b76-4127-8f05-f535e0f15e2a}_1731661390540\"\n", - "\n", - "# Check if the folder exists\n", - "if not os.path.isdir(folder_path):\n", - " raise Exception(f\"Error: The folder '{folder_path}' does not exist.\") \n", - "\n", - "# Iterate through files in the folder\n", - "for filename in os.listdir(folder_path):\n", - " if filename.endswith(\".csv\"): # Check if the file is a CSV\n", - " file_path = os.path.join(folder_path, filename)\n", - " try:\n", - " print(f\"Reading file: {file_path}\")\n", - " # Read the CSV file with the specified delimiter\n", - " df = pd.read_csv(file_path, delimiter=\"\\t\", low_memory=False)\n", - " table_name = os.path.splitext(os.path.basename(file_path))[0] #Get name of file\n", - " \n", - " #Export Table to sqlite db\n", - " df.to_sql(table_name, conn, if_exists='replace', index=False)\n", - " \n", - " except Exception as e:\n", - " raise Exception(f\"Error reading file {file_path}: {e}\")\n", - "\n", - "conn.commit()\n", - "conn.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9cafd0c-a3bd-408b-bca8-b0de2acde1cd", - "metadata": {}, - "outputs": [], - "source": [ - "# Create a SQL connection to our SQLite database\n", - "conn = sqlite3.connect(\"codes/omop_54.sqlite\")\n", - "cur = conn.cursor()\n", - "\n", - "#Print ALL Columns in Table\n", - "# table=\"CONCEPT_SET\"\n", - "# cur.execute(f\"PRAGMA table_info({table});\")\n", - "# print(pd.DataFrame(cur.fetchall()))\n", - "\n", - "#Print ALL TABLE NAMES\n", - "# cur.execute(\"SELECT name FROM sqlite_master WHERE type='table' AND name=? ;\", (\"VOCABULARY\",))\n", - "# print(cur.fetchone())\n", - " \n", - "cur.execute(\"SELECT vocabulary_id FROM VOCABULARY WHERE vocabulary_id=? ;\", (\"MELDB\",))\n", - "print(cur.fetchone())\n", - "\n", - " \n", - " \n", - "#Print WHOLE TABLE\n", - "# cur.execute('SELECT * FROM CONCEPT;')\n", - "# cur.execute('SELECT * FROM CONCEPT WHERE standard_concept = \"C\";')\n", - "# cur.execute('SELECT * FROM CONCEPT WHERE concept_code = \"119768002\" LIMIT 1;')\n", - "# cur.execute('SELECT * FROM CONCEPT WHERE concept_code IN (\"119768002\", \"5905001\");')\n", - "# cur.execute('SELECT DISTINCT VOCABULARY_ID FROM CONCEPT;')\n", - "# df = pd.DataFrame(cur.fetchall())\n", - "# print(list(df[0]))\n", - "# display(df)\n", - "# for row in :\n", - " # print(row)\n", - "\n", - "\n", - "\n", - "#Get Header of Table\n", - "# table=\"CONCEPT_CLASS\"\n", - "# cur.execute(f\"SELECT * FROM {table} LIMIT 3;\")\n", - "# print(cur.fetchall())\n", - "\n", - "#create meldb VOCABULARY\n", - "# meldb_version='v3.2.10'\n", - "# meldb_description = 'Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity'\n", - "# meldb_reference = 'https://www.it-innovation.soton.ac.uk/projects/meldb'\n", - "# df_test = pd.DataFrame([{\n", - "# \"vocabulary_id\": 'MELDB',\n", - "# \"vocabulary_name\": meldb_description,\n", - "# \"vocabulary_reference\": meldb_reference,\n", - "# \"vocabulary_version\": meldb_version,\n", - "# # \"vocabulary_concept_id\": 0,\n", - "# }])\n", - "# df_test.to_sql(\"VOCABULARY\", conn, if_exists='append', index=False)\n", - "\n", - "\n", - "# cur.execute(\"\"\"\n", - "# CREATE TABLE CONCEPT_SET (\n", - "# concept_set_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each concept set\n", - "# atlas_id INTEGER, -- Unique identifier generated by ATLAS\n", - "# concept_set_name TEXT, -- Optional name for the concept set\n", - "# concept_set_description TEXT, -- Optional description for the concept set\n", - "# vocabulary_id TEXT NOT NULL, -- Foreign key to VOCABULARY table\n", - "# FOREIGN KEY (vocabulary_id) REFERENCES VOCABULARY(vocabulary_id)\n", - "# );\"\"\")\n", - "# cur.execute(\"DROP TABLE CONCEPT_SET;\")\n", - "\n", - "# cur.execute(\"\"\"\n", - "# CREATE TABLE CONCEPT_SET_ITEM (\n", - "# concept_set_item_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each mapping\n", - "# concept_set_id INTEGER NOT NULL, -- Foreign key to CONCEPT_SET table\n", - "# concept_id INTEGER NOT NULL, -- Foreign key to CONCEPT table\n", - "# FOREIGN KEY (concept_set_id) REFERENCES CONCEPT_SET(concept_set_id),\n", - "# FOREIGN KEY (concept_id) REFERENCES CONCEPT(concept_id)\n", - "# );\"\"\")\n", - "# cur.execute(\"DROP TABLE CONCEPT_SET_ITEM;\")\n", - "\n", - "# Be sure to close the connection\n", - "conn.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d03b75f3-902f-42d7-b52f-dac7e79ecb11", - "metadata": {}, - "outputs": [], - "source": [ - "conn = sqlite3.connect(\"codes/omop_54.sqlite\") # change to 'sqlite:///your_filename.db'\n", - "cur = conn.cursor()\n", - "\n", - "file_path = \"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/HEART_VALVE_DISORDERS.csv\"\n", - "df = pd.read_csv(file_path, low_memory=False)\n", - "df = df.set_index(\"code\")\n", - "\n", - "df.to_sql(name='test', con=conn, if_exists='replace')\n", - "\n", - "conn.commit()\n", - "conn.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d96c3511-3831-400e-ba40-0a36abcc60d3", - "metadata": {}, - "outputs": [], - "source": [ - "#DISPLAY SQL TABLE\n", - "table=\"CONCEPT_SET_ITEM\"\n", - "\n", - "# Create a SQL connection to our SQLite database\n", - "conn = sqlite3.connect(\"codes/omop_54.sqlite\")\n", - "cur = conn.cursor()\n", - "\n", - "#Print ALL Columns in Table\n", - "cur.execute(f\"PRAGMA table_info({table});\")\n", - "df_cols = pd.DataFrame(cur.fetchall())\n", - "print(df_cols)\n", - "df_cols = df_cols[1]\n", - "\n", - "#Print TABLE\n", - "cur.execute(f\"SELECT * FROM {table};\")\n", - "df = pd.DataFrame(cur.fetchall())\n", - "df = df.rename(columns={i:s for i, s in enumerate(df_cols)})\n", - "display(df)\n", - "\n", - "conn.close()\n", - "\n", - "\n", - "# a+s = 13364 \n", - "# a+s+i = 13591\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "42d49a00-9646-4ba4-afb6-12297289b7a7", - "metadata": {}, - "outputs": [], - "source": [ - "def sql_row_exist(conn, table, column, value):\n", - "\t# Execute and check if a result exists\n", - "\tcur = conn.cursor()\n", - "\tquery = f\"SELECT 1 FROM {table} WHERE {column} = ? LIMIT 1;\"\n", - "\tcur.execute(query, (value,))\n", - "\texists = cur.fetchone() is not None\n", - "\t\n", - "\treturn exists" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f7b51bcd-6ee1-4023-8d36-7f419ce4120d", - "metadata": {}, - "outputs": [], - "source": [ - "#EXPORT MELDB CSV OUTPUT\n", - "\n", - "conn = sqlite3.connect(\"codes/omop_54.sqlite\") # change to 'sqlite:///your_filename.db'\n", - "cur = conn.cursor()\n", - "\n", - "vocab_output = \"MELDB\"\n", - "vocab_type = \"SNOMED\"\n", - "file_path = \"/home/jjd1c23/ssd/meldb/jjd1c23/phenotype/output/V3_2_10_MELD_snomed_no_translate.csv\"\n", - "# file_path = \"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/HEART_VALVE_DISORDERS.csv\"\n", - "\n", - "# Read the CSV file with the specified delimiter\n", - "out = pd.read_csv(file_path, low_memory=False)\n", - "print(df.columns)\n", - "\n", - "for concept_set_name, grp in out.groupby(\"MELDB_concept\"):\n", - " # display(concept_set_name, grp[[\"code\", \"MELDB_concept\"]])\n", - " \n", - " #Create Concept_Set\n", - " if not sql_row_exist(conn, \"CONCEPT_SET\", \"concept_set_name\", concept_set_name):\n", - " cur.execute(f\"INSERT INTO CONCEPT_SET (concept_set_name, vocabulary_id) VALUES ('{concept_set_name}', 'MELDB');\")\n", - " else:\n", - " print(\"concept_set\", concept_set_name, \"already exists\")\n", - " #TODO: ask to remove old concept_set?\n", - " \n", - " #Get Concept_set_Id\n", - " query = \"SELECT concept_set_id FROM CONCEPT_SET WHERE concept_set_name = ? AND vocabulary_id = ?;\"\n", - " cur.execute(query, (concept_set_name, vocab_output, )) \n", - " concept_set_id = cur.fetchone()[0]\n", - " \n", - " #Get corresponing Concept_id (OMOP) for each Concept_code (e.g. SNOMED)\n", - " concept_codes = \"'\"+\"', '\".join(list(grp[\"code\"].astype(str)))+\"'\"\n", - " query = f\"SELECT concept_id FROM CONCEPT WHERE vocabulary_id = ? AND concept_code IN ({concept_codes});\"\n", - " print(query)\n", - " cur.execute(query, (vocab_type, ))\n", - " df_out = pd.DataFrame(cur.fetchall(), columns=[\"concept_id\"])\n", - " \n", - " if not len(grp) == len(df_out):\n", - " print(\"ERROR: Some\", vocab_type, \"Codes do not exist in OMOP Database\")\n", - " \n", - " #Create Concept_set_item\n", - " df_out[\"concept_set_id\"] = concept_set_id\n", - " df_out.to_sql(\"CONCEPT_SET_ITEM\", conn, if_exists='append', index=False)\n", - " \n", - " display(df_out)\n", - " \n", - " \n", - " \n", - " # break\n", - " \n", - " \n", - "\n", - "# #Create New CONCEPT_SET\n", - "# table_name = os.path.splitext(os.path.basename(file_path))[0] #Get name of file\n", - "# cur.execute(f\"INSERT INTO CONCEPT_SET (concept_class_name) VALUES ('{table_name}');\")\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "\n", - "\n", - "conn.commit()\n", - "conn.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85007741-e34c-4112-a63c-9fb302b76958", - "metadata": {}, - "outputs": [], - "source": [ - "\"'\"+\"', '\".join(list(grp[\"code\"].astype(str)))+\"'\"" - ] - }, - { - "cell_type": "markdown", - "id": "423e7c21-f3bd-439d-9dcb-c17cc2cc6854", - "metadata": { - "jp-MarkdownHeadingCollapsed": true, - "tags": [] - }, - "source": [ - "### ATLAS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6b45e4d-c7d2-42e7-9b4a-0e9c1c86d34b", - "metadata": {}, - "outputs": [], - "source": [ - "#Create ATLAS Concept Set\n", - "\n", - "def atlas_create_concept(name, description=\"\", items=[]):\n", - " data={\n", - " \"id\": 0,\n", - " \"name\": name,\n", - " \"description\": description,\n", - " \"expression\": {\n", - " \"items\":items \n", - " }\n", - " }\n", - "\n", - " try:\n", - " # Sending the POST request\n", - " response = requests.post(url, json=data, headers=headers)\n", - "\n", - " # Check the response status\n", - " if response.status_code == 200 or response.status_code == 201:\n", - " print(\"POST request successful:\")\n", - " print(response.json()) # Assuming the response is JSON\n", - " return response[\"id\"]\n", - " else:\n", - " print(f\"POST request failed. HTTP Status Code: {response.status_code}\")\n", - " print(\"Response content:\")\n", - " print(response.text)\n", - " return None\n", - "\n", - " except requests.exceptions.RequestException as e:\n", - " print(f\"An error occurred: {e}\")\n", - "\n", - "# Heart Test 1 - 1885487\n", - "# Heart Test 2 - 1885488\n", - "# Heart Valve Disorders - 1885449\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45497623-1da0-4f74-b21e-da8811c89b04", - "metadata": {}, - "outputs": [], - "source": [ - "def get_omop_concepts(cur, codes, vocab_id): \n", - " #Create List for SQL\n", - " mask = \"\"\n", - " for c in codes:\n", - " mask+=f'\"{c}\", '\n", - " mask = mask[:-2] #remove last comma\n", - " \n", - " #Execute SQL\n", - " cur.execute(f'SELECT * FROM CONCEPT WHERE concept_code IN ({mask}) AND VOCABULARY_ID = \"{vocab_id}\";')\n", - " df = pd.DataFrame(cur.fetchall()) #convert to pandas df\n", - " \n", - " print(\"Identified\", len(df[0]) ,\"OMOP Concepts:\", list(df[0]))\n", - " \n", - " return df\n", - " \n", - "def omop_concepts_to_atlas_json(df):\n", - " json = []\n", - " for i, row in df.iterrows():\n", - " #template for atlas api\n", - " out = { \n", - " \"concept\": {\n", - " 'CONCEPT_ID': row[0],\n", - " 'CONCEPT_NAME': row[1],\n", - " 'STANDARD_CONCEPT': 'S',\n", - " 'STANDARD_CONCEPT_CAPTION': 'Standard',\n", - " 'INVALID_REASON': 'V',\n", - " 'INVALID_REASON_CAPTION': 'Valid',\n", - " 'CONCEPT_CODE': row[6],\n", - " 'DOMAIN_ID': row[2],\n", - " 'VOCABULARY_ID': row[3],\n", - " 'CONCEPT_CLASS_ID': row[4],\n", - " 'VALID_START_DATE': int(row[7]),\n", - " 'VALID_END_DATE': int(row[8])\n", - " },\n", - " 'isExcluded': False,\n", - " 'includeDescendants': False,\n", - " 'includeMapped': False\n", - " }\n", - " json.append(out)\n", - " return json \n", - "\n", - "conn = sqlite3.connect(\"codes/omop_54.sqlite\")\n", - "cur = conn.cursor()\n", - "\n", - "vocab_id=\"SNOMED\" #SNOMED, ATC, ICD10CM, ICD9CM, Read\n", - "csv_output = \"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/ANGER.csv\"\n", - "\n", - "#Load CSV Output File\n", - "df_in = pd.read_csv(csv_output)\n", - "print(len(df_in))\n", - "\n", - "# df = get_omop_concepts(cur, [\"119768002\", \"5905001\"], \"SNOMED\")\n", - "df = get_omop_concepts(cur, list(df_in[\"code\"]), vocab_id)\n", - "json = omop_concepts_to_atlas_json(df)\n", - "# display(json)\n", - "\n", - "conn.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea759907-c085-472a-82e2-07b6b19e2c8f", - "metadata": {}, - "outputs": [], - "source": [ - "#ATLAS GET CONCEPT SET\n", - "import requests\n", - "\n", - "def request_get(url):\n", - " try:\n", - " # Sending the GET request\n", - " response = requests.get(url)\n", - "\n", - " # Check if the response status code is 200 (OK)\n", - " if response.status_code == 200:\n", - " print(\"Response data:\")\n", - " # print(response.json()) # Assuming the response is in JSON format\n", - " return response.json()\n", - " else:\n", - " print(f\"Failed to fetch data. HTTP Status Code: {response.status_code}\")\n", - " print(\"Response content:\")\n", - " print(response.text)\n", - " return None\n", - "\n", - " except requests.exceptions.RequestException as e:\n", - " print(f\"An error occurred: {e}\")\n", - "\n", - "\n", - "#GET SET INFO\n", - "set_id = \"1885449\"\n", - "url = f\"https://atlas-demo.ohdsi.org/WebAPI/conceptset/{set_id}\"\n", - "request_get(url)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a70e636-6051-4930-bf1b-30d093fd0552", - "metadata": {}, - "outputs": [], - "source": [ - "#GET SET ITEMS (Concepts)\n", - "set_id = \"1885449\"\n", - "url = f\"https://atlas-demo.ohdsi.org/WebAPI/conceptset/{set_id}/expression/ATLASPROD\"\n", - "response = request_get(url)\n", - "display(response)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "96bfcd9c-27e8-4be4-a680-7553d908790e", - "metadata": {}, - "outputs": [], - "source": [ - "#ATLAS CREATE CONCEPT SET\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - }, - "toc-showtags": false - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/script/import.sh b/script/import.sh deleted file mode 100644 index 909a27a3e82ff4aadc78db3fd049ac982f5617e9..0000000000000000000000000000000000000000 --- a/script/import.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! /usr/bin/bash - -echo "Removing Corrupted Files from Ho" -rm codes/GitHub_TG_repository/lymphoma_prevalence_birm_cam/lymphoma_prevalence_birm_cam_ICD10.csv -rm codes/GitHub_TG_repository/Menieresdisease_birm_cam/Menieresdisease_birm_cam_ICD10.csv -rm codes/GitHub_TG_repository/peripheral_neuropathy_birm_cam/peripheral_neuropathy_birm_cam_ICD10.csv -rm codes/GitHub_TG_repository/Sjogrenssyndrome_Bham_CAM/Sjogrenssyndrome_Bham_CAM_ICD10.csv \ No newline at end of file diff --git a/script/run.sh b/script/run.sh deleted file mode 100644 index b64772d21a5e2476e544c0afd6826de0502df4aa..0000000000000000000000000000000000000000 --- a/script/run.sh +++ /dev/null @@ -1,44 +0,0 @@ -#! /usr/bin/bash - -version="V3_2_10" -previous="V3_2_9" - -python main.py -r2 PHEN_assign_v3.json CONC_summary_working.xlsx -mv output/MELD_concepts_read.csv output/${version}_MELD_concepts_readv2.csv - -python main.py -i PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate -mv output/MELD_concepts_read.csv output/${version}_MELD_icd10_no_translate.csv - -python main.py -s PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate -mv output/MELD_concepts_read.csv output/${version}_MELD_snomed_no_translate.csv - -# python main.py -o PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate -# mv output/MELD_concepts_read.csv output/${version}_MELD_opcs4_no_translate.csv - -python main.py -a PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate -mv output/MELD_concepts_read.csv output/${version}_MELD_atc_no_translate.csv - -# python main.py -m PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate -# mv output/MELD_concepts_read.csv output/${version}_MELD_med_no_translate.csv - -mv output/MELD_errors.csv output/${version}_MELD_errors.csv - - -#Generate Report -rm concepts-output/MELD-report.md -python report.py PHEN_assign_v3.json CONC_summary_working.xlsx codes/ concepts-output/MELD-report.md ${version} ${previous} - -#Divide Concepts to Output Repo -rm -rf concepts-output/readv2/* -rm -rf concepts-output/icd10/* -rm -rf concepts-output/snomed/* -rm -rf concepts-output/atc/* -python publish.py output/${version}_MELD_concepts_readv2.csv concepts-output/readv2/ -python publish.py output/${version}_MELD_icd10_no_translate.csv concepts-output/icd10/ -python publish.py output/${version}_MELD_snomed_no_translate.csv concepts-output/snomed/ -python publish.py output/${version}_MELD_atc_no_translate.csv concepts-output/atc/ -cp output/${version}_MELD_errors.csv concepts-output/${version}_MELD_errors.csv - -# Show Changes in Output repo (should be same as report) -cd concepts-output -git diff --stat \ No newline at end of file