"""Convert the phenotype assignment JSON file into a YAML phenotype config.

The script reads ``json_file``, builds one concept-set entry for every name
listed under a file's ``concept_set`` key, and writes the resulting phenotype
definition to ``yaml_path``.

Not handled yet (such files are skipped, as before):
    TODO: files with "actions" (split_col / divide_col)
    TODO: files with "excel_sheet"
    TODO: "concept_set_categories" and per-file metadata (has to be a dict,
          not a list?)
"""

import json
from pathlib import Path

json_file = "PHEN_assign_v3.json"
yaml_path = "config.yml"

# Concept set that is deliberately left out of the generated config.
_SKIPPED_CONCEPT = "PLASMACELL"

# Column types that are copied into the output; every other column is dropped.
_SUPPORTED_COLUMNS = frozenset({"read2"})

# Number of trailing characters removed from a column key to get its type
# (presumably a "_code" suffix, e.g. "read2_code" -> "read2" -- TODO confirm).
_COLUMN_SUFFIX_LEN = 5

# Number of leading characters removed from every folder value
# (presumably a "codes/" prefix -- TODO confirm against the input file).
_FOLDER_PREFIX_LEN = 6


def add_conc(outs, name, path, columns, category=None, metadata=None):
    """Append one concept-set entry to *outs* and return *outs*.

    Args:
        outs: List of concept-set entries; it is extended in place.
        name: Concept-set name. The name "PLASMACELL" is skipped.
        path: Path of the code file backing the concept set.
        columns: Mapping of column type to column name in that file.
        category: Optional category, stored under ``file.category`` as a string.
        metadata: Optional metadata, stored under ``metadata`` as given.

    Returns:
        The same *outs* list that was passed in.
    """
    if name == _SKIPPED_CONCEPT:
        return outs

    entry = {
        "name": str(name),
        "file": {
            "path": str(path),
            "columns": columns,
        },
    }
    if category is not None:
        entry["file"]["category"] = str(category)
    if metadata is not None:
        entry["metadata"] = metadata

    outs.append(entry)
    return outs


def _select_columns(columns):
    """Return ``{column_type: column_name}`` for the supported column types."""
    selected = {}
    for key, value in columns.items():
        column_type = key[:-_COLUMN_SUFFIX_LEN]
        # Non-string values (e.g. a nested "metadata" entry) are not columns.
        if isinstance(value, str) and column_type in _SUPPORTED_COLUMNS:
            selected[column_type] = value
    return selected


def build_concept_sets(data):
    """Build the list of concept-set entries from the parsed JSON *data*.

    Args:
        data: Parsed input with a "codes" list; each item has a "folder" and a
            list of "files", each file having "file", "columns" and optionally
            "concept_set", "actions" or "excel_sheet".

    Returns:
        A list of entries as produced by :func:`add_conc`.
    """
    concept_sets = []
    for folder in data["codes"]:
        folder_path = folder["folder"][_FOLDER_PREFIX_LEN:]
        for entry in folder["files"]:
            # Files needing actions or an Excel sheet are not supported yet.
            if "actions" in entry or "excel_sheet" in entry:
                continue
            if "concept_set" not in entry:
                continue

            columns = _select_columns(entry["columns"])
            path = f"{folder_path}/{entry['file']}"

            names = entry["concept_set"]
            if isinstance(names, str):
                # A bare string is one name, not a sequence of characters.
                names = [names]

            for name in names:  # a file may belong to several concept sets
                add_conc(
                    concept_sets,
                    name=name,
                    path=path,
                    # Each entry gets its own dict so entries never share state.
                    columns=dict(columns),
                    metadata={},
                )
    return concept_sets


def build_phenotype(concept_sets):
    """Wrap *concept_sets* in the top-level phenotype definition."""
    return {
        "phenotype": {
            "version": "4.0.0",
            "omop": {
                "vocabulary_id": "MELDB_SAIL",
                "vocabulary_name": "Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity",
                "vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb",
            },
            "map": ["read2", "read3", "icd10", "snomed", "opcs4", "atc"],
            "concept_sets": concept_sets,
        },
    }


def write_yaml(config, path):
    """Write *config* to *path* as block-style YAML without anchors/aliases."""
    # Imported lazily: PyYAML is only needed for this final write, so the
    # conversion helpers above stay importable without it.
    import yaml

    class _NoAliasDumper(yaml.Dumper):
        """Dumper that always repeats values instead of emitting aliases."""

        def ignore_aliases(self, data):
            return True

    # A local Dumper subclass avoids patching yaml.Dumper for the whole process.
    with open(path, "w", encoding="utf-8") as file:
        yaml.dump(
            config,
            file,
            Dumper=_NoAliasDumper,
            default_flow_style=False,
            allow_unicode=True,
        )


if __name__ == "__main__":
    # Read the JSON file
    with Path(json_file).open("r", encoding="utf-8") as file:
        data = json.load(file)

    outs = build_concept_sets(data)
    final = build_phenotype(outs)

    # Convert and write to YAML
    write_yaml(final, yaml_path)