From 4f78ba8595a9de3eb5ca296dd87237ecfa503f15 Mon Sep 17 00:00:00 2001 From: Jakub Dylag <jjd1c23@soton.ac.uk> Date: Fri, 4 Apr 2025 10:58:55 +0100 Subject: [PATCH] Convertion script - allow multiple files per concept set --- convert.ipynb | 335 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 272 insertions(+), 63 deletions(-) diff --git a/convert.ipynb b/convert.ipynb index a600daf..a136f4b 100644 --- a/convert.ipynb +++ b/convert.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -14,110 +14,319 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n", + "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n", + "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n", + "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n", + "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n", + "divide_col 13 {'divide_col': 'MMCode'}\n", + "divide_col 22 {'divide_col': 'MMCode'}\n", + "divide_col 5 {'divide_col': 'MMCode'}\n", + "divide_col 33 {'divide_col': 'MMCode'}\n", + "divide_col 37 {'divide_col': 'MMCode'}\n", + "divide_col 41 {'divide_col': 'MMCode'}\n", + "divide_col 34 {'divide_col': 'MMCode'}\n", + "divide_col 12 {'divide_col': 'MMCode'}\n", + "divide_col 6 {'divide_col': 'MMCode'}\n", + "divide_col 11 {'divide_col': 'MMCode'}\n", + "divide_col 28 {'divide_col': 'MMCode'}\n", + "divide_col 3 {'divide_col': 'MMCode'}\n", + "divide_col 21 {'divide_col': 'MMCode'}\n", + "divide_col 16 {'divide_col': 'MMCode'}\n", + "divide_col 17 {'divide_col': 'MMCode'}\n", + "divide_col 36 {'divide_col': 'MMCode'}\n", + "divide_col 27 {'divide_col': 'MMCode'}\n", + "divide_col 26 {'divide_col': 'MMCode'}\n", + "divide_col 24 {'divide_col': 'MMCode'}\n", + "divide_col 2 {'divide_col': 'MMCode'}\n", + "divide_col 31 {'divide_col': 'MMCode'}\n", + "divide_col 14 {'divide_col': 'MMCode'}\n", + "divide_col 35 {'divide_col': 'MMCode'}\n", + "divide_col 39 {'divide_col': 'MMCode'}\n", + "divide_col 38 {'divide_col': 'MMCode'}\n", + "divide_col 25 {'divide_col': 'MMCode'}\n", + "divide_col 23 {'divide_col': 'MMCode'}\n", + "divide_col 19 {'divide_col': 'MMCode'}\n", + "divide_col 40 {'divide_col': 'MMCode'}\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>name</th>\n", + " <th>files</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>ALL_MEDICATIONS</td>\n", + " <td>{'path': 'Medication code source/WP02_SAIL_WIL...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>PAIN_MEDICATIONS_IF_NO_EPILEPSY_DIAGNOSIS</td>\n", + " <td>{'path': 'Medication code source/Pain medicati...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>PAIN_MEDICATIONS</td>\n", + " <td>{'path': 'Medication code source/Pain medicati...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>ABDO_PAIN</td>\n", + " <td>{'path': 'ClinicalCodes.org from the Universit...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>CVD_EVENTS</td>\n", + " <td>{'path': 'ClinicalCodes.org from the Universit...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>481</th>\n", + " <td>SLEEP_PROBLEMS</td>\n", + " <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/SLEEP_...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>482</th>\n", + " <td>SWEATING</td>\n", + " <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/SWEATI...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>483</th>\n", + " <td>TIREDNESS</td>\n", + " <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/TIREDN...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>484</th>\n", + " <td>UNINTENTIONAL_WEIGHT_LOSS</td>\n", + " <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/UNINTE...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>485</th>\n", + " <td>URINARY_INCONTINENCE</td>\n", + " <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/URINAR...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>486 rows × 2 columns</p>\n", + "</div>" + ], + "text/plain": [ + " name \\\n", + "0 ALL_MEDICATIONS \n", + "1 PAIN_MEDICATIONS_IF_NO_EPILEPSY_DIAGNOSIS \n", + "2 PAIN_MEDICATIONS \n", + "3 ABDO_PAIN \n", + "4 CVD_EVENTS \n", + ".. ... \n", + "481 SLEEP_PROBLEMS \n", + "482 SWEATING \n", + "483 TIREDNESS \n", + "484 UNINTENTIONAL_WEIGHT_LOSS \n", + "485 URINARY_INCONTINENCE \n", + "\n", + " files \n", + "0 {'path': 'Medication code source/WP02_SAIL_WIL... \n", + "1 {'path': 'Medication code source/Pain medicati... \n", + "2 {'path': 'Medication code source/Pain medicati... \n", + "3 {'path': 'ClinicalCodes.org from the Universit... \n", + "4 {'path': 'ClinicalCodes.org from the Universit... \n", + ".. ... \n", + "481 {'path': 'NEW BURDEN CODELISTS/Symptoms/SLEEP_... \n", + "482 {'path': 'NEW BURDEN CODELISTS/Symptoms/SWEATI... \n", + "483 {'path': 'NEW BURDEN CODELISTS/Symptoms/TIREDN... \n", + "484 {'path': 'NEW BURDEN CODELISTS/Symptoms/UNINTE... \n", + "485 {'path': 'NEW BURDEN CODELISTS/Symptoms/URINAR... \n", + "\n", + "[486 rows x 2 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "165 in yaml\n" + ] + } + ], "source": [ "json_file = \"PHEN_assign_v3.json\"\n", - "yaml_path = \"workspace/phen/config.yml\"\n", - "source_folder_path = \"workspace/phen/concepts\"\n", - "outs = {}\n", + "yaml_path = \"config.yml\"\n", + "source_folder_path = \"concepts\"\n", + "outs = []\n", "\n", "# Read the JSON file\n", "with open(json_file, 'r', encoding='utf-8') as file:\n", " data = json.load(file)\n", "\n", - "def add_conc(outs, name, path, columns, category=None, metadata=None):\n", + "def add_conc(outs, name, path, columns, category=None, actions=None, #metacol=None\n", + "):\n", + " #TODO: acmc handle empty conceptset when all QA fail \n", " if name == \"PLASMACELL\":\n", " return outs\n", - " \n", + "\n", " out = {\n", " \"name\":str(name),\n", - " \"file\":{\n", - " \"path\":str(path),\n", + " \"files\":{\n", + " \"path\":str(path).replace(\"\\\\\", '/'),\n", " \"columns\":columns,\n", " },\n", " }\n", - " if category is not None:\n", - " out[\"file\"][\"category\"]=str(category)\n", - " if metadata is not None:\n", - " out[\"metadata\"]=metadata\n", + " #divide_col\n", + " if (category is not None) and (actions is not None):\n", + " print(\"divide_col\", category, actions)\n", + " out[\"files\"][\"category\"]=str(category)\n", + " out[\"files\"][\"actions\"] = {}\n", + " out[\"files\"][\"actions\"][\"divide_col\"] = actions[\"divide_col\"]\n", + " #split_col\n", + " elif (actions is not None):\n", + " print(\"split_col\", actions)\n", + " out[\"files\"][\"actions\"] = {}\n", + " out[\"files\"][\"actions\"][\"split_col\"] = actions[\"split_col\"]\n", + " out[\"files\"][\"actions\"][\"codes_col\"] = actions[\"codes_col\"]\n", + "\n", + " # if metacol is not None:\n", + " # out[\"metacol\"]=metacol\n", "\n", " outs.append(out)\n", " return outs\n", "\n", - "outs = []\n", "for folder in data[\"codes\"]:\n", " folder_path = folder[\"folder\"]\n", - " for files in folder[\"files\"]:\n", + " for file in folder[\"files\"]:\n", "\n", " #TODO: actions divide_col\n", - " #TODO: save metadata - has to be dict not list?\n", "\n", " #Columns\n", " col_out = {}\n", - " for k,v in files[\"columns\"].items():\n", - " supported = [\"read2\"]\n", + " for k,v in file[\"columns\"].items():\n", + " supported = [\"read2\", \"read3\", \"icd10\", \"snomed\", \"opcs4\", \"atc\"]\n", " if type(v) == str and k[:-5] in supported:\n", " col_out[k[:-5]] = v\n", "\n", - " #Metadata\n", - " # if \"metadata\" in files[\"columns\"]:\n", - " # meta = dict(files[\"columns\"][\"metadata\"])\n", + " #Metacolumn\n", + " # if \"metadata\" in file[\"columns\"]:\n", + " # meta = dict(file[\"columns\"][\"metadata\"])\n", " # else:\n", " # meta = None\n", "\n", " #File Path\n", - " path = folder[\"folder\"][6:]+\"/\"+files[\"file\"]\n", + " new_folder_path = Path(folder[\"folder\"][6:].replace('\\\\','/'))\n", + " new_file_path = Path(file[\"file\"])\n", + " path = Path(new_folder_path / new_file_path)\n", + " \n", + " #Convert XLSX to CSV File\n", + " if \"excel_sheet\" in file.keys():\n", + " # print(\"Converted Excel\", path)\n", + " df_xlsx = pd.read_excel(Path(source_folder_path / path), sheet_name=file[\"excel_sheet\"])\n", + " save_path = Path(source_folder_path / path).with_suffix(\".csv\")\n", + " path = Path(path).with_suffix(\".csv\")\n", + " # df_xlsx.to_csv(save_path) #TODO: uncomment\n", "\n", - " if \"actions\" in files.keys():\n", - " pass\n", - " #split_col\n", - " # if \n", + " if \"actions\" in file.keys():\n", " #divide_col\n", - " # elif \"concept_set_categories\" in files:\n", - " # for cat, name in files[\"concept_set_categories\"].items():\n", - " # print(col_out)\n", - " # outs = add_conc(\n", - " # outs,\n", - " # name = name,\n", - " # category = cat,\n", - " # path=path,\n", - " # columns = {\"read2\":\"Read Code\"}, #TODO: fix bodged\n", - " # metadata = {}\n", - " # )\n", - " elif \"excel_sheet\" in files.keys():\n", - " #Convert XLSX to CSV File\n", - " print(\"Converted Excel\", path)\n", - " df_xlsx = pd.read_excel(source_folder_path+\"/\"+path, sheet_name=files[\"excel_sheet\"])\n", - " path = Path(source_folder_path+\"/\"+path).with_suffix(\".csv\")\n", - " df_xlsx.to_csv(path)\n", + " if \"concept_set_categories\" in file:\n", + " for cat, name in file[\"concept_set_categories\"].items():\n", + " outs = add_conc(\n", + " outs,\n", + " name = name[0],\n", + " category = cat,\n", + " actions = file[\"actions\"],\n", + " path=path,\n", + " columns = col_out, #TODO: fix bodged\n", + " # metacol = meta\n", + " )\n", + " #split_col\n", + " else:\n", + " for name in file[\"concept_set\"]: #If belongs to multiple\n", + " outs = add_conc(\n", + " outs,\n", + " name=str(name),\n", + " path=path,\n", + " columns = col_out,\n", + " actions=file[\"actions\"],\n", + " # metacol = meta\n", + " ) \n", "\n", + " elif \"concept_set\" in file:\n", " #Add multiple concept sets to yaml\n", - " for name in files[\"concept_set\"]: #If belongs to multiple\n", + " for name in file[\"concept_set\"]: #If belongs to multiple\n", " outs = add_conc(\n", " outs,\n", " name=str(name),\n", " path=path,\n", " columns = col_out,\n", - " metadata = {},\n", - " # metadata = meta\n", + " # metacol = meta\n", " )\n", "\n", - " elif \"concept_set\" in files:\n", - " #Add multiple concept sets to yaml\n", - " for name in files[\"concept_set\"]: #If belongs to multiple\n", - " outs = add_conc(\n", - " outs,\n", - " name=str(name),\n", - " path=path,\n", - " columns = col_out,\n", - " metadata = {},\n", - " # metadata = meta\n", - " )\n", + "outs = pd.DataFrame(outs)\n", + "display(outs)\n", + "# print(len(outs.groupby(\"name\")), \"have files, out of\", len(data[\"concept_sets\"][\"concept_set\"]), \"defined\")\n", + "\n", + "final_out = []\n", + "for name, grp in outs.groupby(\"name\"):\n", + " out = {}\n", + " out[\"name\"]=name\n", + " \n", + " out[\"files\"]=list(grp[\"files\"] )\n", + " \n", + " for conc in data[\"concept_sets\"][\"concept_set\"]:\n", + " if conc[\"concept_set_name\"] == name:\n", + " metadata=conc[\"metadata\"]\n", + " break\n", + " out[\"metadata\"]=dict(metadata)\n", + " final_out.append(out)\n", + "\n", + "print(len(final_out), \"in yaml\")\n", + "\n", + "#Add Metadata for each concept set\n", + "# for conc in data[\"concept_sets\"][\"concept_set\"]: #iterate concept set definitions\n", + "# conc_name = conc[\"concept_set_name\"]\n", + "# metadata = conc[\"metadata\"]\n", + "\n", + "# #Look for matching concept set in output \n", + "# for c in outs:\n", + "# if c[\"name\"] == conc_name:\n", + "# c[\"metadata\"] = dict(metadata) #append metadata\n", + "\n", "\n", "#Remove \"PLASMACELL\" concept set\n", - "outs = [(o) for o in outs if o[\"name\"] != \"PLASMACELL\"]\n", + "# outs = [(o) for o in outs if o[\"name\"] != \"PLASMACELL\"]\n", "\n", "final = {\n", " \"phenotype\":{\n", @@ -128,7 +337,7 @@ " \"vocabulary_reference\": \"https://www.it-innovation.soton.ac.uk/projects/meldb\",\n", " },\n", " \"map\":[\"read2\", \"read3\", \"icd10\", \"snomed\", \"opcs4\", \"atc\"],\n", - " \"concept_sets\":outs,\n", + " \"concept_sets\":final_out,\n", " },\n", "}\n", "\n", @@ -142,7 +351,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -156,9 +365,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.9" + "version": "3.12.4" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } -- GitLab