From 4f78ba8595a9de3eb5ca296dd87237ecfa503f15 Mon Sep 17 00:00:00 2001
From: Jakub Dylag <jjd1c23@soton.ac.uk>
Date: Fri, 4 Apr 2025 10:58:55 +0100
Subject: [PATCH] Conversion script - allow multiple files per concept set

---
 convert.ipynb | 335 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 272 insertions(+), 63 deletions(-)

diff --git a/convert.ipynb b/convert.ipynb
index a600daf..a136f4b 100644
--- a/convert.ipynb
+++ b/convert.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -14,110 +14,319 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 37,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n",
+      "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n",
+      "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n",
+      "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n",
+      "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n",
+      "divide_col 13 {'divide_col': 'MMCode'}\n",
+      "divide_col 22 {'divide_col': 'MMCode'}\n",
+      "divide_col 5 {'divide_col': 'MMCode'}\n",
+      "divide_col 33 {'divide_col': 'MMCode'}\n",
+      "divide_col 37 {'divide_col': 'MMCode'}\n",
+      "divide_col 41 {'divide_col': 'MMCode'}\n",
+      "divide_col 34 {'divide_col': 'MMCode'}\n",
+      "divide_col 12 {'divide_col': 'MMCode'}\n",
+      "divide_col 6 {'divide_col': 'MMCode'}\n",
+      "divide_col 11 {'divide_col': 'MMCode'}\n",
+      "divide_col 28 {'divide_col': 'MMCode'}\n",
+      "divide_col 3 {'divide_col': 'MMCode'}\n",
+      "divide_col 21 {'divide_col': 'MMCode'}\n",
+      "divide_col 16 {'divide_col': 'MMCode'}\n",
+      "divide_col 17 {'divide_col': 'MMCode'}\n",
+      "divide_col 36 {'divide_col': 'MMCode'}\n",
+      "divide_col 27 {'divide_col': 'MMCode'}\n",
+      "divide_col 26 {'divide_col': 'MMCode'}\n",
+      "divide_col 24 {'divide_col': 'MMCode'}\n",
+      "divide_col 2 {'divide_col': 'MMCode'}\n",
+      "divide_col 31 {'divide_col': 'MMCode'}\n",
+      "divide_col 14 {'divide_col': 'MMCode'}\n",
+      "divide_col 35 {'divide_col': 'MMCode'}\n",
+      "divide_col 39 {'divide_col': 'MMCode'}\n",
+      "divide_col 38 {'divide_col': 'MMCode'}\n",
+      "divide_col 25 {'divide_col': 'MMCode'}\n",
+      "divide_col 23 {'divide_col': 'MMCode'}\n",
+      "divide_col 19 {'divide_col': 'MMCode'}\n",
+      "divide_col 40 {'divide_col': 'MMCode'}\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>files</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ALL_MEDICATIONS</td>\n",
+       "      <td>{'path': 'Medication code source/WP02_SAIL_WIL...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>PAIN_MEDICATIONS_IF_NO_EPILEPSY_DIAGNOSIS</td>\n",
+       "      <td>{'path': 'Medication code source/Pain medicati...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>PAIN_MEDICATIONS</td>\n",
+       "      <td>{'path': 'Medication code source/Pain medicati...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>ABDO_PAIN</td>\n",
+       "      <td>{'path': 'ClinicalCodes.org from the Universit...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>CVD_EVENTS</td>\n",
+       "      <td>{'path': 'ClinicalCodes.org from the Universit...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>481</th>\n",
+       "      <td>SLEEP_PROBLEMS</td>\n",
+       "      <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/SLEEP_...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>482</th>\n",
+       "      <td>SWEATING</td>\n",
+       "      <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/SWEATI...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>483</th>\n",
+       "      <td>TIREDNESS</td>\n",
+       "      <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/TIREDN...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>484</th>\n",
+       "      <td>UNINTENTIONAL_WEIGHT_LOSS</td>\n",
+       "      <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/UNINTE...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>485</th>\n",
+       "      <td>URINARY_INCONTINENCE</td>\n",
+       "      <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/URINAR...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>486 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                          name  \\\n",
+       "0                              ALL_MEDICATIONS   \n",
+       "1    PAIN_MEDICATIONS_IF_NO_EPILEPSY_DIAGNOSIS   \n",
+       "2                             PAIN_MEDICATIONS   \n",
+       "3                                    ABDO_PAIN   \n",
+       "4                                   CVD_EVENTS   \n",
+       "..                                         ...   \n",
+       "481                             SLEEP_PROBLEMS   \n",
+       "482                                   SWEATING   \n",
+       "483                                  TIREDNESS   \n",
+       "484                  UNINTENTIONAL_WEIGHT_LOSS   \n",
+       "485                       URINARY_INCONTINENCE   \n",
+       "\n",
+       "                                                 files  \n",
+       "0    {'path': 'Medication code source/WP02_SAIL_WIL...  \n",
+       "1    {'path': 'Medication code source/Pain medicati...  \n",
+       "2    {'path': 'Medication code source/Pain medicati...  \n",
+       "3    {'path': 'ClinicalCodes.org from the Universit...  \n",
+       "4    {'path': 'ClinicalCodes.org from the Universit...  \n",
+       "..                                                 ...  \n",
+       "481  {'path': 'NEW BURDEN CODELISTS/Symptoms/SLEEP_...  \n",
+       "482  {'path': 'NEW BURDEN CODELISTS/Symptoms/SWEATI...  \n",
+       "483  {'path': 'NEW BURDEN CODELISTS/Symptoms/TIREDN...  \n",
+       "484  {'path': 'NEW BURDEN CODELISTS/Symptoms/UNINTE...  \n",
+       "485  {'path': 'NEW BURDEN CODELISTS/Symptoms/URINAR...  \n",
+       "\n",
+       "[486 rows x 2 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "165 in yaml\n"
+     ]
+    }
+   ],
    "source": [
     "json_file = \"PHEN_assign_v3.json\"\n",
-    "yaml_path = \"workspace/phen/config.yml\"\n",
-    "source_folder_path = \"workspace/phen/concepts\"\n",
-    "outs = {}\n",
+    "yaml_path = \"config.yml\"\n",
+    "source_folder_path = \"concepts\"\n",
+    "outs = []\n",
     "\n",
     "# Read the JSON file\n",
     "with open(json_file, 'r', encoding='utf-8') as file:\n",
     "    data = json.load(file)\n",
     "\n",
-    "def add_conc(outs, name, path, columns, category=None, metadata=None):\n",
+    "def add_conc(outs, name, path, columns, category=None, actions=None, #metacol=None\n",
+    "):\n",
+    "    #TODO: acmc handle empty conceptset when all QA fail  \n",
     "    if name == \"PLASMACELL\":\n",
     "        return outs\n",
-    "    \n",
+    "\n",
     "    out = {\n",
     "        \"name\":str(name),\n",
-    "        \"file\":{\n",
-    "            \"path\":str(path),\n",
+    "        \"files\":{\n",
+    "            \"path\":str(path).replace(\"\\\\\", '/'),\n",
     "            \"columns\":columns,\n",
     "        },\n",
     "    }\n",
-    "    if category is not None:\n",
-    "        out[\"file\"][\"category\"]=str(category)\n",
-    "    if metadata is not None:\n",
-    "        out[\"metadata\"]=metadata\n",
+    "    #divide_col\n",
+    "    if (category is not None) and (actions is not None):\n",
+    "        print(\"divide_col\", category, actions)\n",
+    "        out[\"files\"][\"category\"]=str(category)\n",
+    "        out[\"files\"][\"actions\"] = {}\n",
+    "        out[\"files\"][\"actions\"][\"divide_col\"] = actions[\"divide_col\"]\n",
+    "    #split_col\n",
+    "    elif (actions is not None):\n",
+    "        print(\"split_col\", actions)\n",
+    "        out[\"files\"][\"actions\"] = {}\n",
+    "        out[\"files\"][\"actions\"][\"split_col\"] = actions[\"split_col\"]\n",
+    "        out[\"files\"][\"actions\"][\"codes_col\"] = actions[\"codes_col\"]\n",
+    "\n",
+    "    # if metacol is not None:\n",
+    "    #     out[\"metacol\"]=metacol\n",
     "\n",
     "    outs.append(out)\n",
     "    return outs\n",
     "\n",
-    "outs = []\n",
     "for folder in data[\"codes\"]:\n",
     "    folder_path = folder[\"folder\"]\n",
-    "    for files in folder[\"files\"]:\n",
+    "    for file in folder[\"files\"]:\n",
     "\n",
     "        #TODO: actions divide_col\n",
-    "        #TODO: save metadata - has to be dict not list?\n",
     "\n",
     "        #Columns\n",
     "        col_out = {}\n",
-    "        for k,v in files[\"columns\"].items():\n",
-    "            supported = [\"read2\"]\n",
+    "        for k,v in file[\"columns\"].items():\n",
+    "            supported = [\"read2\", \"read3\", \"icd10\", \"snomed\", \"opcs4\", \"atc\"]\n",
     "            if type(v) == str and k[:-5] in supported:\n",
     "                col_out[k[:-5]] = v\n",
     "\n",
-    "        #Metadata\n",
-    "        # if \"metadata\" in files[\"columns\"]:\n",
-    "        #     meta = dict(files[\"columns\"][\"metadata\"])\n",
+    "        #Metacolumn\n",
+    "        # if \"metadata\" in file[\"columns\"]:\n",
+    "        #     meta = dict(file[\"columns\"][\"metadata\"])\n",
     "        # else:\n",
     "        #     meta = None\n",
     "\n",
     "        #File Path\n",
-    "        path = folder[\"folder\"][6:]+\"/\"+files[\"file\"]\n",
+    "        new_folder_path = Path(folder[\"folder\"][6:].replace('\\\\','/'))\n",
+    "        new_file_path = Path(file[\"file\"])\n",
+    "        path = Path(new_folder_path / new_file_path)\n",
+    "        \n",
+    "        #Convert XLSX to CSV File\n",
+    "        if \"excel_sheet\" in file.keys():\n",
+    "            # print(\"Converted Excel\", path)\n",
+    "            df_xlsx = pd.read_excel(Path(source_folder_path / path), sheet_name=file[\"excel_sheet\"])\n",
+    "            save_path = Path(source_folder_path / path).with_suffix(\".csv\")\n",
+    "            path = Path(path).with_suffix(\".csv\")\n",
+    "            # df_xlsx.to_csv(save_path) #TODO: uncomment\n",
     "\n",
-    "        if \"actions\" in files.keys():\n",
-    "            pass\n",
-    "            #split_col\n",
-    "            # if \n",
+    "        if \"actions\" in file.keys():\n",
     "            #divide_col\n",
-    "            # elif \"concept_set_categories\" in files:\n",
-    "            #     for cat, name in files[\"concept_set_categories\"].items():\n",
-    "            #         print(col_out)\n",
-    "            #         outs = add_conc(\n",
-    "            #             outs,\n",
-    "            #             name = name,\n",
-    "            #             category = cat,\n",
-    "            #             path=path,\n",
-    "            #             columns = {\"read2\":\"Read Code\"}, #TODO: fix bodged\n",
-    "            #             metadata = {}\n",
-    "            #         )\n",
-    "        elif \"excel_sheet\" in files.keys():\n",
-    "            #Convert XLSX to CSV File\n",
-    "            print(\"Converted Excel\", path)\n",
-    "            df_xlsx = pd.read_excel(source_folder_path+\"/\"+path, sheet_name=files[\"excel_sheet\"])\n",
-    "            path = Path(source_folder_path+\"/\"+path).with_suffix(\".csv\")\n",
-    "            df_xlsx.to_csv(path)\n",
+    "            if \"concept_set_categories\" in file:\n",
+    "                for cat, name in file[\"concept_set_categories\"].items():\n",
+    "                    outs = add_conc(\n",
+    "                        outs,\n",
+    "                        name = name[0],\n",
+    "                        category = cat,\n",
+    "                        actions = file[\"actions\"],\n",
+    "                        path=path,\n",
+    "                        columns = col_out, #TODO: fix bodged\n",
+    "                        # metacol = meta\n",
+    "                    )\n",
+    "            #split_col\n",
+    "            else:\n",
+    "                for name in file[\"concept_set\"]: #If belongs to multiple\n",
+    "                    outs = add_conc(\n",
+    "                        outs,\n",
+    "                        name=str(name),\n",
+    "                        path=path,\n",
+    "                        columns = col_out,\n",
+    "                        actions=file[\"actions\"],\n",
+    "                        # metacol = meta\n",
+    "                    ) \n",
     "\n",
+    "        elif \"concept_set\" in file:\n",
     "            #Add multiple concept sets to yaml\n",
-    "            for name in files[\"concept_set\"]: #If belongs to multiple\n",
+    "            for name in file[\"concept_set\"]: #If belongs to multiple\n",
     "                outs = add_conc(\n",
     "                    outs,\n",
     "                    name=str(name),\n",
     "                    path=path,\n",
     "                    columns = col_out,\n",
-    "                    metadata = {},\n",
-    "                    # metadata = meta\n",
+    "                    # metacol = meta\n",
     "                )\n",
     "\n",
-    "        elif \"concept_set\" in files:\n",
-    "            #Add multiple concept sets to yaml\n",
-    "            for name in files[\"concept_set\"]: #If belongs to multiple\n",
-    "                outs = add_conc(\n",
-    "                    outs,\n",
-    "                    name=str(name),\n",
-    "                    path=path,\n",
-    "                    columns = col_out,\n",
-    "                    metadata = {},\n",
-    "                    # metadata = meta\n",
-    "                )\n",
+    "outs = pd.DataFrame(outs)\n",
+    "display(outs)\n",
+    "# print(len(outs.groupby(\"name\")), \"have files, out of\", len(data[\"concept_sets\"][\"concept_set\"]), \"defined\")\n",
+    "\n",
+    "final_out = []\n",
+    "for name, grp in outs.groupby(\"name\"):\n",
+    "    out = {}\n",
+    "    out[\"name\"]=name\n",
+    "    \n",
+    "    out[\"files\"]=list(grp[\"files\"] )\n",
+    "    \n",
+    "    for conc in data[\"concept_sets\"][\"concept_set\"]:\n",
+    "        if conc[\"concept_set_name\"] == name:\n",
+    "            metadata=conc[\"metadata\"]\n",
+    "            break\n",
+    "    out[\"metadata\"]=dict(metadata)\n",
+    "    final_out.append(out)\n",
+    "\n",
+    "print(len(final_out), \"in yaml\")\n",
+    "\n",
+    "#Add Metadata for each concept set\n",
+    "# for conc in data[\"concept_sets\"][\"concept_set\"]: #iterate concept set definitions\n",
+    "#     conc_name = conc[\"concept_set_name\"]\n",
+    "#     metadata = conc[\"metadata\"]\n",
+    "\n",
+    "#     #Look for matching concept set in output \n",
+    "#     for c in outs:\n",
+    "#         if c[\"name\"] == conc_name:\n",
+    "#             c[\"metadata\"] = dict(metadata) #append metadata\n",
+    "\n",
     "\n",
     "#Remove \"PLASMACELL\" concept set\n",
-    "outs = [(o) for o in outs if o[\"name\"] != \"PLASMACELL\"]\n",
+    "# outs = [(o) for o in outs if o[\"name\"] != \"PLASMACELL\"]\n",
     "\n",
     "final = {\n",
     "    \"phenotype\":{\n",
@@ -128,7 +337,7 @@
     "            \"vocabulary_reference\": \"https://www.it-innovation.soton.ac.uk/projects/meldb\",\n",
     "        },\n",
     "        \"map\":[\"read2\", \"read3\", \"icd10\", \"snomed\", \"opcs4\", \"atc\"],\n",
-    "        \"concept_sets\":outs,\n",
+    "        \"concept_sets\":final_out,\n",
     "    },\n",
     "}\n",
     "\n",
@@ -142,7 +351,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".venv",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -156,9 +365,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.9"
+   "version": "3.12.4"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
-- 
GitLab