Conversion script - allow multiple files per concept set

4f78ba85 · Jakub Dylag · 62e98fc3 · 4f78ba85
Commit 4f78ba85 authored 3 months ago by Jakub Dylag
--- a/convert.ipynb
+++ b/convert.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -14,110 +14,319 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 37,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n",
+      "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n",
+      "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n",
+      "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n",
+      "split_col {'split_col': 'coding_system', 'codes_col': 'code'}\n",
+      "divide_col 13 {'divide_col': 'MMCode'}\n",
+      "divide_col 22 {'divide_col': 'MMCode'}\n",
+      "divide_col 5 {'divide_col': 'MMCode'}\n",
+      "divide_col 33 {'divide_col': 'MMCode'}\n",
+      "divide_col 37 {'divide_col': 'MMCode'}\n",
+      "divide_col 41 {'divide_col': 'MMCode'}\n",
+      "divide_col 34 {'divide_col': 'MMCode'}\n",
+      "divide_col 12 {'divide_col': 'MMCode'}\n",
+      "divide_col 6 {'divide_col': 'MMCode'}\n",
+      "divide_col 11 {'divide_col': 'MMCode'}\n",
+      "divide_col 28 {'divide_col': 'MMCode'}\n",
+      "divide_col 3 {'divide_col': 'MMCode'}\n",
+      "divide_col 21 {'divide_col': 'MMCode'}\n",
+      "divide_col 16 {'divide_col': 'MMCode'}\n",
+      "divide_col 17 {'divide_col': 'MMCode'}\n",
+      "divide_col 36 {'divide_col': 'MMCode'}\n",
+      "divide_col 27 {'divide_col': 'MMCode'}\n",
+      "divide_col 26 {'divide_col': 'MMCode'}\n",
+      "divide_col 24 {'divide_col': 'MMCode'}\n",
+      "divide_col 2 {'divide_col': 'MMCode'}\n",
+      "divide_col 31 {'divide_col': 'MMCode'}\n",
+      "divide_col 14 {'divide_col': 'MMCode'}\n",
+      "divide_col 35 {'divide_col': 'MMCode'}\n",
+      "divide_col 39 {'divide_col': 'MMCode'}\n",
+      "divide_col 38 {'divide_col': 'MMCode'}\n",
+      "divide_col 25 {'divide_col': 'MMCode'}\n",
+      "divide_col 23 {'divide_col': 'MMCode'}\n",
+      "divide_col 19 {'divide_col': 'MMCode'}\n",
+      "divide_col 40 {'divide_col': 'MMCode'}\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>files</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ALL_MEDICATIONS</td>\n",
+       "      <td>{'path': 'Medication code source/WP02_SAIL_WIL...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>PAIN_MEDICATIONS_IF_NO_EPILEPSY_DIAGNOSIS</td>\n",
+       "      <td>{'path': 'Medication code source/Pain medicati...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>PAIN_MEDICATIONS</td>\n",
+       "      <td>{'path': 'Medication code source/Pain medicati...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>ABDO_PAIN</td>\n",
+       "      <td>{'path': 'ClinicalCodes.org from the Universit...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>CVD_EVENTS</td>\n",
+       "      <td>{'path': 'ClinicalCodes.org from the Universit...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>481</th>\n",
+       "      <td>SLEEP_PROBLEMS</td>\n",
+       "      <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/SLEEP_...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>482</th>\n",
+       "      <td>SWEATING</td>\n",
+       "      <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/SWEATI...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>483</th>\n",
+       "      <td>TIREDNESS</td>\n",
+       "      <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/TIREDN...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>484</th>\n",
+       "      <td>UNINTENTIONAL_WEIGHT_LOSS</td>\n",
+       "      <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/UNINTE...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>485</th>\n",
+       "      <td>URINARY_INCONTINENCE</td>\n",
+       "      <td>{'path': 'NEW BURDEN CODELISTS/Symptoms/URINAR...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>486 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                          name  \\\n",
+       "0                              ALL_MEDICATIONS   \n",
+       "1    PAIN_MEDICATIONS_IF_NO_EPILEPSY_DIAGNOSIS   \n",
+       "2                             PAIN_MEDICATIONS   \n",
+       "3                                    ABDO_PAIN   \n",
+       "4                                   CVD_EVENTS   \n",
+       "..                                         ...   \n",
+       "481                             SLEEP_PROBLEMS   \n",
+       "482                                   SWEATING   \n",
+       "483                                  TIREDNESS   \n",
+       "484                  UNINTENTIONAL_WEIGHT_LOSS   \n",
+       "485                       URINARY_INCONTINENCE   \n",
+       "\n",
+       "                                                 files  \n",
+       "0    {'path': 'Medication code source/WP02_SAIL_WIL...  \n",
+       "1    {'path': 'Medication code source/Pain medicati...  \n",
+       "2    {'path': 'Medication code source/Pain medicati...  \n",
+       "3    {'path': 'ClinicalCodes.org from the Universit...  \n",
+       "4    {'path': 'ClinicalCodes.org from the Universit...  \n",
+       "..                                                 ...  \n",
+       "481  {'path': 'NEW BURDEN CODELISTS/Symptoms/SLEEP_...  \n",
+       "482  {'path': 'NEW BURDEN CODELISTS/Symptoms/SWEATI...  \n",
+       "483  {'path': 'NEW BURDEN CODELISTS/Symptoms/TIREDN...  \n",
+       "484  {'path': 'NEW BURDEN CODELISTS/Symptoms/UNINTE...  \n",
+       "485  {'path': 'NEW BURDEN CODELISTS/Symptoms/URINAR...  \n",
+       "\n",
+       "[486 rows x 2 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "165 in yaml\n"
+     ]
+    }
+   ],
   "source": [
    "json_file = \"PHEN_assign_v3.json\"\n",
-    "yaml_path = \"workspace/phen/config.yml\"\n",
+    "yaml_path = \"config.yml\"\n",
-    "source_folder_path = \"workspace/phen/concepts\"\n",
+    "source_folder_path = \"concepts\"\n",
-    "outs = {}\n",
+    "outs = []\n",
    "\n",
    "# Read the JSON file\n",
    "with open(json_file, 'r', encoding='utf-8') as file:\n",
    "    data = json.load(file)\n",
    "\n",
-    "def add_conc(outs, name, path, columns, category=None, metadata=None):\n",
+    "def add_conc(outs, name, path, columns, category=None, actions=None, #metacol=None\n",
+    "):\n",
+    "    #TODO: acmc handle empty conceptset when all QA fail  \n",
    "    if name == \"PLASMACELL\":\n",
    "        return outs\n",
    "\n",
    "    out = {\n",
    "        \"name\":str(name),\n",
-    "        \"file\":{\n",
+    "        \"files\":{\n",
-    "            \"path\":str(path),\n",
+    "            \"path\":str(path).replace(\"\\\\\", '/'),\n",
    "            \"columns\":columns,\n",
    "        },\n",
    "    }\n",
-    "    if category is not None:\n",
+    "    #divide_col\n",
-    "        out[\"file\"][\"category\"]=str(category)\n",
+    "    if (category is not None) and (actions is not None):\n",
-    "    if metadata is not None:\n",
+    "        print(\"divide_col\", category, actions)\n",
-    "        out[\"metadata\"]=metadata\n",
+    "        out[\"files\"][\"category\"]=str(category)\n",
+    "        out[\"files\"][\"actions\"] = {}\n",
+    "        out[\"files\"][\"actions\"][\"divide_col\"] = actions[\"divide_col\"]\n",
+    "    #split_col\n",
+    "    elif (actions is not None):\n",
+    "        print(\"split_col\", actions)\n",
+    "        out[\"files\"][\"actions\"] = {}\n",
+    "        out[\"files\"][\"actions\"][\"split_col\"] = actions[\"split_col\"]\n",
+    "        out[\"files\"][\"actions\"][\"codes_col\"] = actions[\"codes_col\"]\n",
+    "\n",
+    "    # if metacol is not None:\n",
+    "    #     out[\"metacol\"]=metacol\n",
    "\n",
    "    outs.append(out)\n",
    "    return outs\n",
    "\n",
-    "outs = []\n",
    "for folder in data[\"codes\"]:\n",
    "    folder_path = folder[\"folder\"]\n",
-    "    for files in folder[\"files\"]:\n",
+    "    for file in folder[\"files\"]:\n",
    "\n",
    "        #TODO: actions divide_col\n",
-    "        #TODO: save metadata - has to be dict not list?\n",
    "\n",
    "        #Columns\n",
    "        col_out = {}\n",
-    "        for k,v in files[\"columns\"].items():\n",
+    "        for k,v in file[\"columns\"].items():\n",
-    "            supported = [\"read2\"]\n",
+    "            supported = [\"read2\", \"read3\", \"icd10\", \"snomed\", \"opcs4\", \"atc\"]\n",
    "            if type(v) == str and k[:-5] in supported:\n",
    "                col_out[k[:-5]] = v\n",
    "\n",
-    "        #Metadata\n",
+    "        #Metacolumn\n",
-    "        # if \"metadata\" in files[\"columns\"]:\n",
+    "        # if \"metadata\" in file[\"columns\"]:\n",
-    "        #     meta = dict(files[\"columns\"][\"metadata\"])\n",
+    "        #     meta = dict(file[\"columns\"][\"metadata\"])\n",
    "        # else:\n",
    "        #     meta = None\n",
    "\n",
    "        #File Path\n",
-    "        path = folder[\"folder\"][6:]+\"/\"+files[\"file\"]\n",
+    "        new_folder_path = Path(folder[\"folder\"][6:].replace('\\\\','/'))\n",
+    "        new_file_path = Path(file[\"file\"])\n",
+    "        path = Path(new_folder_path / new_file_path)\n",
    "        \n",
-    "        if \"actions\" in files.keys():\n",
-    "            pass\n",
-    "            #split_col\n",
-    "            # if \n",
-    "            #divide_col\n",
-    "            # elif \"concept_set_categories\" in files:\n",
-    "            #     for cat, name in files[\"concept_set_categories\"].items():\n",
-    "            #         print(col_out)\n",
-    "            #         outs = add_conc(\n",
-    "            #             outs,\n",
-    "            #             name = name,\n",
-    "            #             category = cat,\n",
-    "            #             path=path,\n",
-    "            #             columns = {\"read2\":\"Read Code\"}, #TODO: fix bodged\n",
-    "            #             metadata = {}\n",
-    "            #         )\n",
-    "        elif \"excel_sheet\" in files.keys():\n",
    "        #Convert XLSX to CSV File\n",
-    "            print(\"Converted Excel\", path)\n",
+    "        if \"excel_sheet\" in file.keys():\n",
-    "            df_xlsx = pd.read_excel(source_folder_path+\"/\"+path, sheet_name=files[\"excel_sheet\"])\n",
+    "            # print(\"Converted Excel\", path)\n",
-    "            path = Path(source_folder_path+\"/\"+path).with_suffix(\".csv\")\n",
+    "            df_xlsx = pd.read_excel(Path(source_folder_path / path), sheet_name=file[\"excel_sheet\"])\n",
-    "            df_xlsx.to_csv(path)\n",
+    "            save_path = Path(source_folder_path / path).with_suffix(\".csv\")\n",
+    "            path = Path(path).with_suffix(\".csv\")\n",
+    "            # df_xlsx.to_csv(save_path) #TODO: uncomment\n",
    "\n",
-    "            #Add multiple concept sets to yaml\n",
+    "        if \"actions\" in file.keys():\n",
-    "            for name in files[\"concept_set\"]: #If belongs to multiple\n",
+    "            #divide_col\n",
+    "            if \"concept_set_categories\" in file:\n",
+    "                for cat, name in file[\"concept_set_categories\"].items():\n",
+    "                    outs = add_conc(\n",
+    "                        outs,\n",
+    "                        name = name[0],\n",
+    "                        category = cat,\n",
+    "                        actions = file[\"actions\"],\n",
+    "                        path=path,\n",
+    "                        columns = col_out, #TODO: fix bodged\n",
+    "                        # metacol = meta\n",
+    "                    )\n",
+    "            #split_col\n",
+    "            else:\n",
+    "                for name in file[\"concept_set\"]: #If belongs to multiple\n",
    "                    outs = add_conc(\n",
    "                        outs,\n",
    "                        name=str(name),\n",
    "                        path=path,\n",
    "                        columns = col_out,\n",
-    "                    metadata = {},\n",
+    "                        actions=file[\"actions\"],\n",
-    "                    # metadata = meta\n",
+    "                        # metacol = meta\n",
    "                    ) \n",
    "\n",
-    "        elif \"concept_set\" in files:\n",
+    "        elif \"concept_set\" in file:\n",
    "            #Add multiple concept sets to yaml\n",
-    "            for name in files[\"concept_set\"]: #If belongs to multiple\n",
+    "            for name in file[\"concept_set\"]: #If belongs to multiple\n",
    "                outs = add_conc(\n",
    "                    outs,\n",
    "                    name=str(name),\n",
    "                    path=path,\n",
    "                    columns = col_out,\n",
-    "                    metadata = {},\n",
+    "                    # metacol = meta\n",
-    "                    # metadata = meta\n",
    "                )\n",
    "\n",
+    "outs = pd.DataFrame(outs)\n",
+    "display(outs)\n",
+    "# print(len(outs.groupby(\"name\")), \"have files, out of\", len(data[\"concept_sets\"][\"concept_set\"]), \"defined\")\n",
+    "\n",
+    "final_out = []\n",
+    "for name, grp in outs.groupby(\"name\"):\n",
+    "    out = {}\n",
+    "    out[\"name\"]=name\n",
+    "    \n",
+    "    out[\"files\"]=list(grp[\"files\"] )\n",
+    "    \n",
+    "    for conc in data[\"concept_sets\"][\"concept_set\"]:\n",
+    "        if conc[\"concept_set_name\"] == name:\n",
+    "            metadata=conc[\"metadata\"]\n",
+    "            break\n",
+    "    out[\"metadata\"]=dict(metadata)\n",
+    "    final_out.append(out)\n",
+    "\n",
+    "print(len(final_out), \"in yaml\")\n",
+    "\n",
+    "#Add Metadata for each concept set\n",
+    "# for conc in data[\"concept_sets\"][\"concept_set\"]: #iterate concept set definitions\n",
+    "#     conc_name = conc[\"concept_set_name\"]\n",
+    "#     metadata = conc[\"metadata\"]\n",
+    "\n",
+    "#     #Look for matching concept set in output \n",
+    "#     for c in outs:\n",
+    "#         if c[\"name\"] == conc_name:\n",
+    "#             c[\"metadata\"] = dict(metadata) #append metadata\n",
+    "\n",
+    "\n",
    "#Remove \"PLASMACELL\" concept set\n",
-    "outs = [(o) for o in outs if o[\"name\"] != \"PLASMACELL\"]\n",
+    "# outs = [(o) for o in outs if o[\"name\"] != \"PLASMACELL\"]\n",
    "\n",
    "final = {\n",
    "    \"phenotype\":{\n",
@@ -128,7 +337,7 @@
    "            \"vocabulary_reference\": \"https://www.it-innovation.soton.ac.uk/projects/meldb\",\n",
    "        },\n",
    "        \"map\":[\"read2\", \"read3\", \"icd10\", \"snomed\", \"opcs4\", \"atc\"],\n",
-    "        \"concept_sets\":outs,\n",
+    "        \"concept_sets\":final_out,\n",
    "    },\n",
    "}\n",
    "\n",
@@ -142,7 +351,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": ".venv",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -156,9 +365,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.9"
+   "version": "3.12.4"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
 %% Cell type:code id: tags:
 ``` python
 import yaml
 import json
 from pathlib import Path
 import pandas as pd
 ```
 %% Cell type:code id: tags:
 ``` python
 json_file = "PHEN_assign_v3.json"
-yaml_path = "workspace/phen/config.yml"
+yaml_path = "config.yml"
-source_folder_path = "workspace/phen/concepts"
+source_folder_path = "concepts"
-outs = {}
+outs = []
 # Read the JSON file
 with open(json_file, 'r', encoding='utf-8') as file:
    data = json.load(file)
-def add_conc(outs, name, path, columns, category=None, metadata=None):
+def add_conc(outs, name, path, columns, category=None, actions=None, #metacol=None
+):
+    #TODO: acmc handle empty conceptset when all QA fail
    if name == "PLASMACELL":
        return outs
    out = {
        "name":str(name),
-        "file":{
+        "files":{
-            "path":str(path),
+            "path":str(path).replace("\\", '/'),
            "columns":columns,
        },
    }
-    if category is not None:
+    #divide_col
-        out["file"]["category"]=str(category)
+    if (category is not None) and (actions is not None):
-    if metadata is not None:
+        print("divide_col", category, actions)
-        out["metadata"]=metadata
+        out["files"]["category"]=str(category)
+        out["files"]["actions"] = {}
+        out["files"]["actions"]["divide_col"] = actions["divide_col"]
+    #split_col
+    elif (actions is not None):
+        print("split_col", actions)
+        out["files"]["actions"] = {}
+        out["files"]["actions"]["split_col"] = actions["split_col"]
+        out["files"]["actions"]["codes_col"] = actions["codes_col"]
+    # if metacol is not None:
+    #     out["metacol"]=metacol
    outs.append(out)
    return outs
-outs = []
 for folder in data["codes"]:
    folder_path = folder["folder"]
-    for files in folder["files"]:
+    for file in folder["files"]:
        #TODO: actions divide_col
-        #TODO: save metadata - has to be dict not list?
        #Columns
        col_out = {}
-        for k,v in files["columns"].items():
+        for k,v in file["columns"].items():
-            supported = ["read2"]
+            supported = ["read2", "read3", "icd10", "snomed", "opcs4", "atc"]
            if type(v) == str and k[:-5] in supported:
                col_out[k[:-5]] = v
-        #Metadata
+        #Metacolumn
-        # if "metadata" in files["columns"]:
+        # if "metadata" in file["columns"]:
-        #     meta = dict(files["columns"]["metadata"])
+        #     meta = dict(file["columns"]["metadata"])
        # else:
        #     meta = None
        #File Path
-        path = folder["folder"][6:]+"/"+files["file"]
+        new_folder_path = Path(folder["folder"][6:].replace('\\','/'))
+        new_file_path = Path(file["file"])
+        path = Path(new_folder_path / new_file_path)
+        #Convert XLSX to CSV File
+        if "excel_sheet" in file.keys():
+            # print("Converted Excel", path)
+            df_xlsx = pd.read_excel(Path(source_folder_path / path), sheet_name=file["excel_sheet"])
+            save_path = Path(source_folder_path / path).with_suffix(".csv")
+            path = Path(path).with_suffix(".csv")
+            # df_xlsx.to_csv(save_path) #TODO: uncomment
-        if "actions" in files.keys():
+        if "actions" in file.keys():
-            pass
-            #split_col
-            # if
            #divide_col
-            # elif "concept_set_categories" in files:
+            if "concept_set_categories" in file:
-            #     for cat, name in files["concept_set_categories"].items():
+                for cat, name in file["concept_set_categories"].items():
-            #         print(col_out)
+                    outs = add_conc(
-            #         outs = add_conc(
+                        outs,
-            #             outs,
+                        name = name[0],
-            #             name = name,
+                        category = cat,
-            #             category = cat,
+                        actions = file["actions"],
-            #             path=path,
+                        path=path,
-            #             columns = {"read2":"Read Code"}, #TODO: fix bodged
+                        columns = col_out, #TODO: fix bodged
-            #             metadata = {}
+                        # metacol = meta
-            #         )
+                    )
-        elif "excel_sheet" in files.keys():
+            #split_col
-            #Convert XLSX to CSV File
+            else:
-            print("Converted Excel", path)
+                for name in file["concept_set"]: #If belongs to multiple
-            df_xlsx = pd.read_excel(source_folder_path+"/"+path, sheet_name=files["excel_sheet"])
+                    outs = add_conc(
-            path = Path(source_folder_path+"/"+path).with_suffix(".csv")
+                        outs,
-            df_xlsx.to_csv(path)
+                        name=str(name),
+                        path=path,
+                        columns = col_out,
+                        actions=file["actions"],
+                        # metacol = meta
+                    )
+        elif "concept_set" in file:
            #Add multiple concept sets to yaml
-            for name in files["concept_set"]: #If belongs to multiple
+            for name in file["concept_set"]: #If belongs to multiple
                outs = add_conc(
                    outs,
                    name=str(name),
                    path=path,
                    columns = col_out,
-                    metadata = {},
+                    # metacol = meta
-                    # metadata = meta
                )
-        elif "concept_set" in files:
+outs = pd.DataFrame(outs)
-            #Add multiple concept sets to yaml
+display(outs)
-            for name in files["concept_set"]: #If belongs to multiple
+# print(len(outs.groupby("name")), "have files, out of", len(data["concept_sets"]["concept_set"]), "defined")
-                outs = add_conc(
-                    outs,
+final_out = []
-                    name=str(name),
+for name, grp in outs.groupby("name"):
-                    path=path,
+    out = {}
-                    columns = col_out,
+    out["name"]=name
-                    metadata = {},
-                    # metadata = meta
+    out["files"]=list(grp["files"] )
-                )
+    for conc in data["concept_sets"]["concept_set"]:
+        if conc["concept_set_name"] == name:
+            metadata=conc["metadata"]
+            break
+    out["metadata"]=dict(metadata)
+    final_out.append(out)
+print(len(final_out), "in yaml")
+#Add Metadata for each concept set
+# for conc in data["concept_sets"]["concept_set"]: #iterate concept set definitions
+#     conc_name = conc["concept_set_name"]
+#     metadata = conc["metadata"]
+#     #Look for matching concept set in output
+#     for c in outs:
+#         if c["name"] == conc_name:
+#             c["metadata"] = dict(metadata) #append metadata
 #Remove "PLASMACELL" concept set
-outs = [(o) for o in outs if o["name"] != "PLASMACELL"]
+# outs = [(o) for o in outs if o["name"] != "PLASMACELL"]
 final = {
    "phenotype":{
        "version": "4.0.0",
        "omop":{
            "vocabulary_id": "MELDB_SAIL",
            "vocabulary_name": "Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity",
            "vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb",
        },
        "map":["read2", "read3", "icd10", "snomed", "opcs4", "atc"],
-        "concept_sets":outs,
+        "concept_sets":final_out,
    },
 }
 yaml.Dumper.ignore_aliases = lambda *args : True #remove unwanted pointers
 # Convert and write to YAML
 with open(yaml_path, 'w', encoding='utf-8') as file:
    yaml.dump(dict(final), file, default_flow_style=False, allow_unicode=True)
 ```
+%% Output
+    split_col {'split_col': 'coding_system', 'codes_col': 'code'}
+    split_col {'split_col': 'coding_system', 'codes_col': 'code'}
+    split_col {'split_col': 'coding_system', 'codes_col': 'code'}
+    split_col {'split_col': 'coding_system', 'codes_col': 'code'}
+    split_col {'split_col': 'coding_system', 'codes_col': 'code'}
+    divide_col 13 {'divide_col': 'MMCode'}
+    divide_col 22 {'divide_col': 'MMCode'}
+    divide_col 5 {'divide_col': 'MMCode'}
+    divide_col 33 {'divide_col': 'MMCode'}
+    divide_col 37 {'divide_col': 'MMCode'}
+    divide_col 41 {'divide_col': 'MMCode'}
+    divide_col 34 {'divide_col': 'MMCode'}
+    divide_col 12 {'divide_col': 'MMCode'}
+    divide_col 6 {'divide_col': 'MMCode'}
+    divide_col 11 {'divide_col': 'MMCode'}
+    divide_col 28 {'divide_col': 'MMCode'}
+    divide_col 3 {'divide_col': 'MMCode'}
+    divide_col 21 {'divide_col': 'MMCode'}
+    divide_col 16 {'divide_col': 'MMCode'}
+    divide_col 17 {'divide_col': 'MMCode'}
+    divide_col 36 {'divide_col': 'MMCode'}
+    divide_col 27 {'divide_col': 'MMCode'}
+    divide_col 26 {'divide_col': 'MMCode'}
+    divide_col 24 {'divide_col': 'MMCode'}
+    divide_col 2 {'divide_col': 'MMCode'}
+    divide_col 31 {'divide_col': 'MMCode'}
+    divide_col 14 {'divide_col': 'MMCode'}
+    divide_col 35 {'divide_col': 'MMCode'}
+    divide_col 39 {'divide_col': 'MMCode'}
+    divide_col 38 {'divide_col': 'MMCode'}
+    divide_col 25 {'divide_col': 'MMCode'}
+    divide_col 23 {'divide_col': 'MMCode'}
+    divide_col 19 {'divide_col': 'MMCode'}
+    divide_col 40 {'divide_col': 'MMCode'}
+    165 in yaml