"Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']\n",
"- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']\n",
"- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']\n",
"- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training BertBiLSTM\n",
"Available GPUs: 1\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\uwu\\miniconda3\\envs\\uni\\lib\\site-packages\\lightning_fabric\\connector.py:555: UserWarning: 16 is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!\n",
" rank_zero_warn(\n",
"Using 16bit Automatic Mixed Precision (AMP)\n",
"GPU available: True (cuda), used: True\n",
"TPU available: False, using: 0 TPU cores\n",
"IPU available: False, using: 0 IPUs\n",
"HPU available: False, using: 0 HPUs\n",
"C:\\Users\\uwu\\miniconda3\\envs\\uni\\lib\\site-packages\\pytorch_lightning\\trainer\\connectors\\logger_connector\\logger_connector.py:67: UserWarning: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n",
" warning_cache.warn(\n",
"C:\\Users\\uwu\\miniconda3\\envs\\uni\\lib\\site-packages\\pytorch_lightning\\callbacks\\model_checkpoint.py:615: UserWarning: Checkpoint directory C:\\Users\\uwu\\PycharmProjects\\COMP3200\\Models exists and is not empty.\n",
" rank_zero_warn(f\"Checkpoint directory {dirpath} exists and is not empty.\")\n",
"LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n",
"\n",
" | Name | Type | Params\n",
"-----------------------------------------\n",
"0 | model | BertBiLSTM | 165 M \n",
"1 | criterion | NLLLoss | 0 \n",
"-----------------------------------------\n",
"56.4 M Trainable params\n",
"109 M Non-trainable params\n",
"165 M Total params\n",
"663.376 Total estimated model params size (MB)\n"
"Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x0000020B93049EE0>\n",
"Traceback (most recent call last):\n",
" File \"C:\\Users\\uwu\\miniconda3\\envs\\uni\\lib\\site-packages\\torch\\utils\\data\\dataloader.py\", line 1478, in __del__\n",
" self._shutdown_workers()\n",
" File \"C:\\Users\\uwu\\miniconda3\\envs\\uni\\lib\\site-packages\\torch\\utils\\data\\dataloader.py\", line 1436, in _shutdown_workers\n",
" if self._persistent_workers or self._workers_status[worker_id]:\n",
"AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status'\n",
"C:\\Users\\uwu\\miniconda3\\envs\\uni\\lib\\site-packages\\lightning_fabric\\connector.py:555: UserWarning: 16 is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!\n",
"Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
"- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
"- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
"- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
"- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
Loading cnn_dailymail dataset 3.0.0 with split type: train[:100%]
Loading cnn_dailymail dataset 3.0.0 with split type: train[:1%]
Loading cnn_dailymail dataset 3.0.0 with split type: validation[:1%]
Loading cnn_dailymail dataset 3.0.0 with split type: test[:1%]
Found cached dataset cnn_dailymail (C:/Users/uwu/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
Loading cnn_dailymail dataset 3.0.0 with split type: validation[:10%]
Found cached dataset cnn_dailymail (C:/Users/uwu/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
Loading cnn_dailymail dataset 3.0.0 with split type: test[:10%]
Found cached dataset cnn_dailymail (C:/Users/uwu/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
Loading cnn_dailymail dataset 3.0.0 with split type: train[:1%]
Loading cnn_dailymail dataset 3.0.0 with split type: validation[:1%]
Loading cnn_dailymail dataset 3.0.0 with split type: test[:1%]
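For reference, the split strings logged above map directly onto the Hugging Face `datasets` slicing syntax. A minimal sketch, assuming the `datasets` library and using the 1% slices shown in the log (the 10% and 100% slices work the same way):

    from datasets import load_dataset

    # "3.0.0" is the cnn_dailymail configuration name; the slice suffix selects a percentage of the split.
    train_ds = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
    val_ds = load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]")
    test_ds = load_dataset("cnn_dailymail", "3.0.0", split="test[:1%]")
    print(len(train_ds), len(val_ds), len(test_ds))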
Pad token is: 0
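A pad token id of 0 is what bert-base-uncased's tokenizer reports for its [PAD] token; a quick check, assuming the `transformers` tokenizer is the source of this value:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    print("Pad token is:", tokenizer.pad_token_id)  # 0, i.e. [PAD] for bert-base-uncased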
Program configuration:
Verbose Level: 1
Adding time to model output: True
Dataset configuration:
Train Split Percentage: 1
Validation Split Percentage: 1
Test Split Percentage: 1
Training configuration:
Number of training epochs: 8
Number of k-folds: 2
Batch size: 64
Mixed Precision: 16-mixed
Using Lightning: True
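A sketch of how this configuration might be wired into Lightning; the dictionary keys and the Trainer call are assumptions mirroring the values printed above, not the project's actual code. Note that "16-mixed" is the precision spelling the warning later in the log asks for, and that the batch size belongs to the DataLoader rather than the Trainer:

    import pytorch_lightning as pl

    config = {"epochs": 8, "k_folds": 2, "batch_size": 64, "precision": "16-mixed"}

    # k_folds and batch_size would be consumed by the cross-validation loop and the
    # DataLoader respectively; only epochs and precision go to the Trainer itself.
    trainer = pl.Trainer(
        max_epochs=config["epochs"],
        precision=config["precision"],
        accelerator="auto",
        devices=1,
    )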
Found cached dataset cnn_dailymail (C:/Users/uwu/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
Found cached dataset cnn_dailymail (C:/Users/uwu/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
Found cached dataset cnn_dailymail (C:/Users/uwu/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
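These "weights ... were not used" messages are expected here: BertModel keeps only the encoder and drops the pre-training heads (cls.predictions.*, cls.seq_relationship.*). A sketch of the load, with an optional log-level change that hides the repeated notice:

    from transformers import BertModel, logging as hf_logging

    hf_logging.set_verbosity_error()  # optional: silence the repeated "weights not used" notice
    bert = BertModel.from_pretrained("bert-base-uncased")  # encoder only, no pre-training heads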
Training BertBiLSTM
Available GPUs: 1
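The parameter summary earlier in the log (109 M non-trainable, 56.4 M trainable, 165 M total) is consistent with a frozen bert-base encoder feeding trainable layers on top. A sketch of that freezing pattern; the BiLSTM dimensions are illustrative assumptions and the printed counts will not reproduce the exact figures above:

    import torch.nn as nn
    from transformers import BertModel

    bert = BertModel.from_pretrained("bert-base-uncased")
    for p in bert.parameters():
        p.requires_grad = False  # freeze the ~109 M encoder parameters

    # Trainable head on top of the frozen encoder (sizes are placeholders).
    bilstm = nn.LSTM(input_size=768, hidden_size=512, bidirectional=True, batch_first=True)

    frozen = sum(p.numel() for p in bert.parameters() if not p.requires_grad)
    trainable = sum(p.numel() for p in bilstm.parameters() if p.requires_grad)
    print(f"frozen: {frozen / 1e6:.1f} M, trainable: {trainable / 1e6:.1f} M")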
C:\Users\uwu\miniconda3\envs\uni\lib\site-packages\lightning_fabric\connector.py:555: UserWarning: 16 is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
rank_zero_warn(
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
C:\Users\uwu\miniconda3\envs\uni\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:67: UserWarning: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
warning_cache.warn(
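The default-logger warning can be avoided by passing a logger explicitly (or by installing `lightning[extra]` / `tensorboard`). A sketch using the CSV logger Lightning falls back to anyway; the save directory and experiment name are assumptions:

    import pytorch_lightning as pl
    from pytorch_lightning.loggers import CSVLogger  # or TensorBoardLogger if tensorboard is installed

    logger = CSVLogger(save_dir="logs", name="BertBiLSTM")
    trainer = pl.Trainer(logger=logger, max_epochs=8)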
C:\Users\uwu\miniconda3\envs\uni\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:615: UserWarning: Checkpoint directory C:\Users\uwu\PycharmProjects\COMP3200\Models exists and is not empty.
rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x0000020B93049EE0>
Traceback (most recent call last):
File "C:\Users\uwu\miniconda3\envs\uni\lib\site-packages\torch\utils\data\dataloader.py", line 1478, in __del__
self._shutdown_workers()
File "C:\Users\uwu\miniconda3\envs\uni\lib\site-packages\torch\utils\data\dataloader.py", line 1436, in _shutdown_workers
if self._persistent_workers or self._workers_status[worker_id]:
AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status'
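This __del__ traceback is raised while a worker iterator is garbage-collected before it finished initialising, and is generally cosmetic. One common mitigation on Windows is to keep workers persistent or to drop to num_workers=0; a self-contained sketch with a dummy dataset (the batch size and worker count are assumptions):

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    dummy = TensorDataset(torch.zeros(128, 8), torch.zeros(128, dtype=torch.long))
    loader = DataLoader(
        dummy,
        batch_size=64,
        num_workers=2,
        persistent_workers=True,  # keep workers alive between epochs instead of tearing them down
    )
    for xb, yb in loader:
        pass  # iterate once so workers start and shut down cleanly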
C:\Users\uwu\miniconda3\envs\uni\lib\site-packages\lightning_fabric\connector.py:555: UserWarning: 16 is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!