Commit 73e99f97 authored by eca1g19

Changed from 16 to 16-mixed fpp

parent 62981f71
%% Cell type:markdown id: tags:
Memory Check
%% Cell type:code id: tags:
``` python
!nvidia-smi
```
%% Output
Wed Jun 14 17:04:39 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.29 Driver Version: 531.29 CUDA Version: 12.1 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 2080 Ti WDDM | 00000000:0E:00.0 On | N/A |
| 41% 49C P8 43W / 260W| 2505MiB / 11264MiB | 17% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 2412 C+G ...inaries\Win64\EpicGamesLauncher.exe N/A |
| 0 N/A N/A 3144 C+G ...a\Local\Mozilla Firefox\firefox.exe N/A |
| 0 N/A N/A 3400 C+G ..._x64__kzf8qxf38zg5c\Skype\Skype.exe N/A |
| 0 N/A N/A 3752 C+G ...GeForce Experience\NVIDIA Share.exe N/A |
| 0 N/A N/A 4240 C+G ...1.0_x64__8wekyb3d8bbwe\Video.UI.exe N/A |
| 0 N/A N/A 6468 C+G ....Search_cw5n1h2txyewy\SearchApp.exe N/A |
| 0 N/A N/A 6828 C+G ...rm 2020.3.3\jbr\bin\jcef_helper.exe N/A |
| 0 N/A N/A 9500 C+G ....0_x64__8wekyb3d8bbwe\HxOutlook.exe N/A |
| 0 N/A N/A 9780 C+G ..._x64__kzf8qxf38zg5c\Skype\Skype.exe N/A |
| 0 N/A N/A 11628 C+G C:\Windows\explorer.exe N/A |
| 0 N/A N/A 12416 C+G ...2txyewy\StartMenuExperienceHost.exe N/A |
| 0 N/A N/A 14040 C+G ...302.5.0_x64__8wekyb3d8bbwe\Time.exe N/A |
| 0 N/A N/A 14792 C+G ...GeForce Experience\NVIDIA Share.exe N/A |
| 0 N/A N/A 16016 C+G ...CBS_cw5n1h2txyewy\TextInputHost.exe N/A |
| 0 N/A N/A 16612 C+G ...ft Office\root\Office16\OUTLOOK.EXE N/A |
| 0 N/A N/A 17024 C+G ....Search_cw5n1h2txyewy\SearchApp.exe N/A |
| 0 N/A N/A 17124 C+G ...oogle\Chrome\Application\chrome.exe N/A |
| 0 N/A N/A 17368 C+G ...l\Microsoft\Teams\current\Teams.exe N/A |
| 0 N/A N/A 20412 C+G ...on\114.0.1823.43\msedgewebview2.exe N/A |
| 0 N/A N/A 20660 C+G ...air\Corsair iCUE5 Software\iCUE.exe N/A |
| 0 N/A N/A 23044 C+G ...\cef\cef.win7x64\steamwebhelper.exe N/A |
| 0 N/A N/A 23360 C+G ...Canary\app-1.0.66\DiscordCanary.exe N/A |
| 0 N/A N/A 24680 C+G ...ne\Binaries\Win64\EpicWebHelper.exe N/A |
| 0 N/A N/A 25200 C+G ...on\wallpaper_engine\wallpaper32.exe N/A |
| 0 N/A N/A 25596 C+G ...e Stream\76.0.3.0\GoogleDriveFS.exe N/A |
| 0 N/A N/A 25952 C+G ..._8wekyb3d8bbwe\Microsoft.Photos.exe N/A |
| 0 N/A N/A 26716 C+G C:\Program Files\RaiderIO\RaiderIO.exe N/A |
| 0 N/A N/A 27700 C+G ...les (x86)\Overwolf\old_Overwolf.exe N/A |
| 0 N/A N/A 28444 C+G ...cordPTB\app-1.0.1027\DiscordPTB.exe N/A |
| 0 N/A N/A 29192 C+G ...les (x86)\Battle.net\Battle.net.exe N/A |
| 0 N/A N/A 31192 C+G ...wolf\0.223.0.33\OverwolfBrowser.exe N/A |
| 0 N/A N/A 31576 C+G C:\Program Files\NordVPN\NordVPN.exe N/A |
| 0 N/A N/A 31956 C+G ...ekyb3d8bbwe\PhoneExperienceHost.exe N/A |
| 0 N/A N/A 32976 C+G ...ft Office\root\Office16\WINWORD.EXE N/A |
| 0 N/A N/A 34400 C+G ...02.0_x86__zpdnekdrzrea0\Spotify.exe N/A |
| 0 N/A N/A 34932 C+G ...ft Office\root\Office16\ONENOTE.EXE N/A |
| 0 N/A N/A 34944 C+G ...5n1h2txyewy\ShellExperienceHost.exe N/A |
| 0 N/A N/A 37420 C+G ...l\Microsoft\Teams\current\Teams.exe N/A |
| 0 N/A N/A 37968 C+G ...al\Discord\app-1.0.9013\Discord.exe N/A |
| 0 N/A N/A 38508 C+G ...t.LockApp_cw5n1h2txyewy\LockApp.exe N/A |
| 0 N/A N/A 42416 C+G ...ft Office\root\Office16\WINWORD.EXE N/A |
| 0 N/A N/A 42952 C+G ...crosoft\Edge\Application\msedge.exe N/A |
| 0 N/A N/A 44812 C+G ...cal\Microsoft\OneDrive\OneDrive.exe N/A |
| 0 N/A N/A 47144 C+G ...a\Local\Mozilla Firefox\firefox.exe N/A |
| 0 N/A N/A 47776 C+G ...siveControlPanel\SystemSettings.exe N/A |
| 0 N/A N/A 49192 C+G ...0_x64__8wekyb3d8bbwe\HxAccounts.exe N/A |
| 0 N/A N/A 49296 C+G ...sair iCUE5 Software\QmlRenderer.exe N/A |
+---------------------------------------------------------------------------------------+
%% Cell type:markdown id: tags:
Pip Installs
%% Cell type:code id: tags:
``` python
# Should be handled by requirements.txt - but isn't.
import os
package_install_override = False
# `False and ...` intentionally short-circuits this guard off; drop the `False`
# (or set the override above) to force a reinstall. Parenthesised for clarity.
if (False and not os.path.exists("installedRepos")) or package_install_override:
    !pip install absl-py
    !pip install aiohttp
    !pip install aiosignal
    !pip install async-timeout
    !pip install cachetools
    !pip install certifi
    !pip install click
    !pip install datasets
    !pip install dill
    !pip install evaluate
    !pip install filelock
    !pip install fonttools
    !pip install frozenlist
    !pip install fsspec
    !pip install google-api-core
    !pip install google-api-python-client
    !pip install google-auth
    !pip install google-auth-httplib2
    !pip install googleapis-common-protos
    !pip install httplib2
    !pip install huggingface-hub
    !pip install ipython-genutils
    !pip install joblib
    !pip install Jupyter-Beeper
    !pip install lightning-utilities
    !pip install mkl-fft
    !pip install mkl-random
    !pip install mkl-service
    !pip install mpmath
    !pip install multidict
    !pip install multiprocess
    !pip install munkres
    !pip install networkx
    !pip install nltk
    !pip install oauth2client
    !pip install pandas
    !pip install Pillow
    !pip install ply
    !pip install protobuf
    !pip install pyarrow
    !pip install pyasn1
    !pip install pyasn1-modules
    !pip install PyDrive
    !pip install pyenchant
    !pip install PyQt5
    !pip install pytorch-beam-search
    !pip install pytorch-lightning
    !pip install pywin32
    !pip install PyYAML
    !pip install pyzmq
    !pip install regex
    !pip install responses
    !pip install rouge-score
    !pip install rsa
    !pip install scikit-learn
    !pip install scipy
    !pip install sentencepiece
    !pip install seqeval
    !pip install sympy
    !pip install threadpoolctl
    !pip install tokenizers
    !pip install torch
    !pip install torch-utils
    !pip install torchaudio
    !pip install torchdata
    !pip install torchmetrics
    !pip install torchtext
    !pip install torchvision
    !pip install transformers
    !pip install uritemplate
    !pip install webencodings
    !pip install wincertstore
    !pip install xxhash
    !pip install yarl
    print("Installed all Packages!")
    # Touch an empty sentinel file so later runs skip the installs.
    open("installedRepos", "w").close()
else:
    print("Packages should be installed already. If this is incorrect, change the override and re-run.")
    package_install_override = False
```
%% Output
Packages should be installed already. If this is incorrect, change the override and re-run.
%% Cell type:markdown id: tags:
# WandB Login
%% Cell type:code id: tags:
``` python
!pip install wandb
import wandb
wandb.login()
```
%% Cell type:markdown id: tags:
Set Git Creds
%% Cell type:code id: tags:
``` python
!git config --global user.name "Ethan Aherne"
!git config --global user.email "eca1g19@soton.ac.uk"
!git config --get user.name
!git config --get user.email
```
%% Output
Ethan Aherne
eca1g19@soton.ac.uk
%% Cell type:markdown id: tags:
Imports
%% Cell type:code id: tags:
``` python
from sklearn.model_selection import KFold
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
import scipy.stats as stats
import evaluate
from torch.utils.data import Dataset
from transformers import BertTokenizerFast
from datasets import load_dataset
from tqdm import tqdm  # Superseded by the tqdm.notebook import below; kept in case removing it breaks something
import torch.nn as nn
import torch
import warnings
from tqdm.notebook import trange, tqdm
import numpy as np
from torch.utils.data import ConcatDataset
import string_utils
import train_utils
from lightning_models import BertLightning, Seq2SeqLightning
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from lightning_fabric.loggers import CSVLogger
import time
import jupyter_beeper
from base_models import BertSingleDense, BertDoubleDense, BertBiLSTM
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger, wandb
from sequence_to_sequence_models import EncoderDecoderBase, FrozenBertEncoder, SingleDenseBertDecoder, \
    DoubleDenseBertDecoder, BiLSTMBertDecoder
```
%% Cell type:markdown id: tags:
Output cuda/cpu
%% Cell type:code id: tags:
``` python
device = "cuda" if torch.cuda.is_available() else "cpu"
print("==============")
print(f"RUNNING ON {device.upper()}")
print("==============")
```
%% Output
==============
RUNNING ON CUDA
==============
%% Cell type:markdown id: tags:
Program Config
%% Cell type:code id: tags:
``` python
verbose = 1
add_time_to_model_name = True
```
%% Cell type:markdown id: tags:
Training config
%% Cell type:code id: tags:
``` python
num_epochs = 8
num_k_folds = 2  # For cross-validating to assess model performance
batch_size = 64
gradient_accumulation_steps = 1
```
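%% Cell type:markdown id: tags:
`gradient_accumulation_steps` is still marked TODO further down; as a minimal sketch (assuming the Lightning path is used), the Trainer's built-in `accumulate_grad_batches` would apply it without any changes to the training loop:
%% Cell type:code id: tags:
``` python
# Sketch only - not yet wired into the training cell below.
# accumulate_grad_batches is a standard Trainer argument; with 1 it is a no-op,
# and the effective batch size becomes batch_size * gradient_accumulation_steps.
from pytorch_lightning import Trainer

sketch_trainer = Trainer(
    max_epochs=num_epochs,
    accumulate_grad_batches=gradient_accumulation_steps,
)
```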
%% Cell type:markdown id: tags:
# Dataset splits config
Set the percentage of each pre-split portion of the CNN/DailyMail dataset to use.
%% Cell type:code id: tags:
``` python
train_split_percentage = 100  # percentage of the pre-defined SPLIT, not of the whole dataset
validate_split_percentage = 10
test_split_percentage = 10
```
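%% Cell type:markdown id: tags:
For reference, these percentages presumably map onto the `datasets` split-slicing syntax (the loading output below prints `train[:100%]`); a minimal sketch of that mapping:
%% Cell type:code id: tags:
``` python
# Sketch: how a split percentage becomes a datasets slice string.
from datasets import load_dataset

split = f"train[:{train_split_percentage}%]"  # e.g. "train[:100%]"
cnn_train_slice = load_dataset("cnn_dailymail", "3.0.0", split=split)
```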
%% Cell type:markdown id: tags:
Lightning Config
%% Cell type:code id: tags:
``` python
use_lightning = True
use_fp16 = True
mixed_precision = "16-mixed"
```
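%% Cell type:markdown id: tags:
`"16-mixed"` is the precision string Lightning 2.x expects for 16-bit Automatic Mixed Precision; the bare integer `16` still works but emits the deprecation warning visible in the training output further down. A minimal sketch of how the flag reaches the Trainer:
%% Cell type:code id: tags:
``` python
# Sketch: the precision flag exactly as the training cell below passes it.
from pytorch_lightning import Trainer

sketch_trainer = Trainer(precision=mixed_precision if use_fp16 else 32)
```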
%% Cell type:markdown id: tags:
Print Configurations if verbose
%% Cell type:code id: tags:
``` python
def output_config():
    print("Program configuration:")
    print(f"Verbose Level: {verbose}")
    print(f"Adding time to model output: {add_time_to_model_name}\n")
    print("Dataset configuration:")
    print(f"Train Split Percentage: {train_split_percentage}")
    print(f"Validation Split Percentage: {validate_split_percentage}")
    print(f"Test Split Percentage: {test_split_percentage}\n")
    print("Training configuration:")
    print(f"Number of training epochs: {num_epochs}")
    print(f"Number of k-folds: {num_k_folds}")
    print(f"Batch size: {batch_size}")
    # TODO Implement
    #print(f"Gradient accumulation steps: {gradient_accumulation_steps}")
    #print(f"Effective Batch Size: {gradient_accumulation_steps * batch_size}\n")
    print(f"Mixed Precision: {mixed_precision}")
    print(f"Using Lightning: {use_lightning}")

if verbose > 0:
    output_config()
```
%% Output
Program configuration:
Verbose Level: 1
Adding time to model output: True
Dataset configuration:
Train Split Percentage: 100
Validation Split Percentage: 10
Test Split Percentage: 10
Training configuration:
Number of training epochs: 8
Number of k-folds: 2
Batch size: 64
Mixed Precision: 16
Using Lightning: True
%% Cell type:markdown id: tags:
Tokenizer Init and config
%% Cell type:code id: tags:
``` python
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
```
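%% Cell type:markdown id: tags:
As a quick illustrative check (not part of the pipeline), the fast tokenizer produces the `input_ids`/`attention_mask` tensors the models below consume:
%% Cell type:code id: tags:
``` python
# Illustrative only: encode a toy sentence and inspect the tensor shapes.
sample = tokenizer("The quick brown fox jumps over the lazy dog.",
                   padding="max_length", truncation=True, max_length=32,
                   return_tensors="pt")
print(sample["input_ids"].shape)       # torch.Size([1, 32])
print(sample["attention_mask"].shape)  # torch.Size([1, 32])
```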
%% Cell type:markdown id: tags:
Custom CNN Dailymail Dataset Class
%% Cell type:code id: tags:
``` python
from cnn_dailymail_dataset import CNNDailyMailDataset
```
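%% Cell type:markdown id: tags:
`cnn_dailymail_dataset.py` is not part of this commit; a rough, hypothetical sketch of the interface the rest of the notebook relies on (field names and log format inferred from the outputs below, everything else assumed):
%% Cell type:code id: tags:
``` python
# Hypothetical sketch only - the real implementation lives in cnn_dailymail_dataset.py.
from datasets import load_dataset
from torch.utils.data import Dataset

class CNNDailyMailDatasetSketch(Dataset):
    def __init__(self, tokenizer, split_type, split_percentage, verbose=0):
        split = f"{split_type}[:{split_percentage}%]"
        if verbose:
            print(f"Loading cnn_dailymail dataset 3.0.0 with split type: {split}")
        self.data = load_dataset("cnn_dailymail", "3.0.0", split=split)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        enc = self.tokenizer(row["article"], truncation=True,
                             padding="max_length", return_tensors="pt")
        lab = self.tokenizer(row["highlights"], truncation=True,
                             padding="max_length", return_tensors="pt")
        return {"input_ids": enc["input_ids"].squeeze(0),
                "attention_mask": enc["attention_mask"].squeeze(0),
                "labels": lab["input_ids"].squeeze(0)}
```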
%% Cell type:markdown id: tags:
Bert Class Initialization
%% Cell type:code id: tags:
``` python
from base_models import BertSingleDense, BertDoubleDense, BertBiLSTM
```
%% Cell type:markdown id: tags:
Load Bert Tokenizer Fast
%% Cell type:code id: tags:
``` python
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
```
%% Cell type:markdown id: tags:
Load Datasets
%% Cell type:code id: tags:
``` python
print("Dataset configuration:")
print(f"Train Split Percentage: {train_split_percentage}")
print(f"Validation Split Percentage: {validate_split_percentage}")
print(f"Test Split Percentage: {test_split_percentage}\n")
train_dataset = CNNDailyMailDataset(tokenizer=tokenizer, split_type='train', split_percentage=train_split_percentage, verbose=verbose)
validation_dataset = CNNDailyMailDataset(tokenizer=tokenizer, split_type='validation',
                                         split_percentage=validate_split_percentage, verbose=verbose)
test_dataset = CNNDailyMailDataset(tokenizer=tokenizer, split_type='test', split_percentage=test_split_percentage,
                                   verbose=verbose)
```
%% Output
Dataset configuration:
Train Split Percentage: 100
Validation Split Percentage: 10
Test Split Percentage: 10
Loading cnn_dailymail dataset 3.0.0 with split type: train[:100%]
Found cached dataset cnn_dailymail (C:/Users/uwu/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
Loading cnn_dailymail dataset 3.0.0 with split type: validation[:10%]
Found cached dataset cnn_dailymail (C:/Users/uwu/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
Loading cnn_dailymail dataset 3.0.0 with split type: test[:10%]
Found cached dataset cnn_dailymail (C:/Users/uwu/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
%% Cell type:markdown id: tags:
Load Datasets into dataloaders
%% Cell type:code id: tags:
``` python
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
```
%% Cell type:markdown id: tags:
Define Model Object and config
%% Cell type:code id: tags:
``` python
```
%% Cell type:markdown id: tags:
Define KFold Object, set to None if not cross validating
%% Cell type:code id: tags:
``` python
cross_validation_k_folder = KFold(n_splits=num_k_folds) if num_k_folds > 0 else None
```
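%% Cell type:markdown id: tags:
For context, a minimal sketch of how the fold indices feed the samplers used in the training cell further down:
%% Cell type:code id: tags:
``` python
# Sketch: KFold.split yields (train_idx, val_idx) index arrays per fold;
# SubsetRandomSampler restricts a DataLoader to one fold without copying data.
if cross_validation_k_folder is not None:
    for fold, (train_idx, val_idx) in enumerate(cross_validation_k_folder.split(train_dataset)):
        print(f"Fold {fold}: {len(train_idx)} train / {len(val_idx)} val examples")
```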
%% Cell type:markdown id: tags:
Define loss function object
%% Cell type:code id: tags:
``` python
criterion = CrossEntropyLoss()
```
%% Cell type:markdown id: tags:
Load Rouge scorer
%% Cell type:code id: tags:
``` python
rouge_score = evaluate.load("rouge")
```
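%% Cell type:markdown id: tags:
A quick illustrative call; `compute` returns a dict of scores keyed `rouge1`, `rouge2`, `rougeL`, and `rougeLsum`:
%% Cell type:code id: tags:
``` python
# Illustrative only: score one toy prediction against a reference.
toy_scores = rouge_score.compute(predictions=["the cat sat on the mat"],
                                 references=["the cat sat on a mat"])
print(toy_scores)  # e.g. {'rouge1': 0.83, 'rouge2': 0.6, ...}
```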
%% Cell type:markdown id: tags:
## Analyze Model scores
Input is a list of dictionaries, one set of scores per batch.
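A hypothetical aggregation helper (assumed names, not from this repo) that averages each metric over those per-batch dicts:
%% Cell type:code id: tags:
``` python
# Hypothetical sketch: average each ROUGE key across the per-batch score
# dicts returned by evaluate_model_and_debug further down.
import numpy as np

def analyze_model_scores(score_totals):
    keys = score_totals[0].keys()
    return {k: float(np.mean([s[k] for s in score_totals])) for k in keys}
```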
%% Cell type:markdown id: tags:
Train the model
%% Cell type:code id: tags:
``` python
output_config()
num_cpus = os.cpu_count()
train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_cpus)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, num_workers=num_cpus)
b = jupyter_beeper.Beeper()
b.beep()
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# Load Datasets
train_dataset = CNNDailyMailDataset(tokenizer=tokenizer, split_type='train', split_percentage=train_split_percentage,
                                    verbose=verbose)
validation_dataset = CNNDailyMailDataset(tokenizer=tokenizer, split_type='validation',
                                         split_percentage=validate_split_percentage, verbose=verbose)
test_dataset = CNNDailyMailDataset(tokenizer=tokenizer, split_type='test', split_percentage=test_split_percentage,
                                   verbose=verbose)
encoder = FrozenBertEncoder()
# Define Model Object
decoders = [SingleDenseBertDecoder(), DoubleDenseBertDecoder(), BiLSTMBertDecoder()]
# Define KFold Object, set to None if not cross validating
cross_validation_k_folder = KFold(n_splits=num_k_folds) if num_k_folds > 0 else None
# Define Optimizer (AdamW) - Filters to only optimize params that are not frozen (i.e. not bert)
# Define loss function object
criterion = nn.NLLLoss()
num_cpus = os.cpu_count()
num_gpus = [torch.cuda.device(i) for i in range(torch.cuda.device_count())]
if len(num_gpus) >= 8:  # num_gpus is a list, so compare its length, not the list itself
    print("POWAAAAAA")
    strategy = "ddp_notebook"
else:
    strategy = None
# Load Datasets into data-loaders
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_cpus)
train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_cpus)
val_loader = DataLoader(validation_dataset, batch_size=batch_size, num_workers=num_cpus)
output_config()
wandb.login()
b = jupyter_beeper.Beeper()
b.beep()
torch.set_float32_matmul_precision("high")
if cross_validation_k_folder is not None:
    for fold, (train_idx, val_idx) in enumerate(cross_validation_k_folder.split(train_dataset)):
        train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
        val_sampler = torch.utils.data.SubsetRandomSampler(val_idx)  # was train_idx, which validated on the training fold
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, num_workers=num_cpus)
        val_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=val_sampler, num_workers=num_cpus)
        encoder = FrozenBertEncoder()
        for decoder in decoders:
            model = EncoderDecoderBase(encoder=encoder, decoder=decoder)
            model = Seq2SeqLightning(model)
            wandb_logger = WandbLogger(name=f"seq2seq_lightning_fold_{fold}", project="seq2seq_lightning")
            checkpoint_callback = ModelCheckpoint(
                monitor='val_loss',
                dirpath=f'checkpoints_fold_{fold}',
                filename='seq2seq-{epoch:02d}-{val_loss:.2f}',
                save_top_k=3,
                mode='min',
            )
            early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min')
            if strategy is not None:
                trainer = Trainer(
                    max_epochs=num_epochs,
                    accelerator="auto",
                    devices=len(num_gpus),
                    precision=mixed_precision if use_fp16 else 32,
                    logger=wandb_logger,
                    callbacks=[checkpoint_callback, early_stopping],
                    strategy=strategy
                )
            else:
                trainer = Trainer(
                    max_epochs=num_epochs,
                    accelerator="auto",
                    devices=len(num_gpus),
                    precision=mixed_precision if use_fp16 else 32,
                    logger=wandb_logger,
                    callbacks=[checkpoint_callback, early_stopping],
                )
            trainer.fit(model, train_loader, val_loader)
else:
    for decoder in decoders:
        model = EncoderDecoderBase(encoder=encoder, decoder=decoder)
        model = Seq2SeqLightning(model)
        wandb_logger = WandbLogger(name="seq2seq_lightning_run", project="seq2seq_lightning")
        checkpoint_callback = ModelCheckpoint(
            monitor='val_loss',
            dirpath='checkpoints',
            filename='seq2seq-{epoch:02d}-{val_loss:.2f}',
            save_top_k=3,
            mode='min',
        )
        early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min')
        if strategy is not None:
            trainer = Trainer(
                max_epochs=num_epochs,
                accelerator="auto",
                devices=len(num_gpus),
                precision=mixed_precision if use_fp16 else 32,
                logger=wandb_logger,
                callbacks=[checkpoint_callback, early_stopping],
                strategy=strategy
            )
        else:
            trainer = Trainer(
                max_epochs=num_epochs,
                accelerator="auto",
                devices=len(num_gpus),
                precision=mixed_precision if use_fp16 else 32,
                logger=wandb_logger,
                callbacks=[checkpoint_callback, early_stopping],
            )
        trainer.fit(model, train_loader, val_loader)
b.beep()
wandb.finish()
```
%% Output
Program configuration:
Verbose Level: 1
Adding time to model output: True
Dataset configuration:
Train Split Percentage: 100
Validation Split Percentage: 10
Test Split Percentage: 10
Training configuration:
Number of training epochs: 8
Number of k-folds: 2
Batch size: 64
Mixed Precision: 16
Using Lightning: True
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Training BertBiLSTM
Available GPUs: 1
C:\Users\uwu\miniconda3\envs\uni\lib\site-packages\lightning_fabric\connector.py:555: UserWarning: 16 is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
rank_zero_warn(
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
C:\Users\uwu\miniconda3\envs\uni\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:67: UserWarning: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
warning_cache.warn(
C:\Users\uwu\miniconda3\envs\uni\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:615: UserWarning: Checkpoint directory C:\Users\uwu\PycharmProjects\COMP3200\Models exists and is not empty.
rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
-----------------------------------------
0 | model | BertBiLSTM | 165 M
1 | criterion | NLLLoss | 0
-----------------------------------------
56.4 M Trainable params
109 M Non-trainable params
165 M Total params
663.376 Total estimated model params size (MB)
C:\Users\uwu\miniconda3\envs\uni\lib\site-packages\pytorch_lightning\trainer\call.py:52: UserWarning: Detected KeyboardInterrupt, attempting graceful shutdown...
rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x0000020B93049EE0>
Traceback (most recent call last):
File "C:\Users\uwu\miniconda3\envs\uni\lib\site-packages\torch\utils\data\dataloader.py", line 1478, in __del__
self._shutdown_workers()
File "C:\Users\uwu\miniconda3\envs\uni\lib\site-packages\torch\utils\data\dataloader.py", line 1436, in _shutdown_workers
if self._persistent_workers or self._workers_status[worker_id]:
AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status'
C:\Users\uwu\miniconda3\envs\uni\lib\site-packages\lightning_fabric\connector.py:555: UserWarning: 16 is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
rank_zero_warn(
Training BertDoubleDense
Available GPUs: 1
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
----------------------------------------------
0 | model | BertDoubleDense | 133 M
1 | criterion | NLLLoss | 0
----------------------------------------------
24.1 M Trainable params
109 M Non-trainable params
133 M Total params
534.177 Total estimated model params size (MB)
%% Cell type:markdown id: tags:
Load a model from a checkpoint (debugging from here):
%% Cell type:code id: tags:
``` python
model = BertLightning(BertSingleDense())
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_cpus)
batch_size = 8  # I am a mortal on local machine
# Trainer() has no `load_from_checkpoint` argument (hence the error below);
# pass the checkpoint to `trainer.test` via `ckpt_path` instead.
trainer = Trainer(devices=len(num_gpus),
                  accelerator="auto",
                  precision="16-mixed")
trainer.test(model, test_loader,
             ckpt_path="Models/epoch=7-val_loss=0.86-rouge=0.00.ckpt")
```
%% Output
Cell In [7], line 6
precision="16",load_from_checkpoint("Models/epoch=7-val_loss=0.86-rouge=0.00.ckpt"))
^
SyntaxError: positional argument follows keyword argument
%% Cell type:code id: tags:
``` python
def evaluate_model_and_debug(model, data_loader, scorer, tokenizer, device='cpu'):
    # Set model to eval mode
    model.eval()
    # Init score tracker
    score_totals = []
    # Disable gradients for evaluation - performance
    with torch.no_grad():
        # Init tqdm
        desc_string = "Evaluation"
        progress_bar = tqdm(data_loader, desc=desc_string)
        # Iterate over each batch
        for batch_id, batch in enumerate(progress_bar):
            # Unpack batch into inputs and outputs
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            # Run Forward Pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Greedy decode for now
            max_values, predicted_indices = torch.max(outputs, dim=-1)
            predicted_tokens = [[tokenizer.convert_ids_to_tokens(idx.item()) for idx in seq] for seq in predicted_indices]
            # Argmax Decode
            outputs_decoded = [tokenizer.decode(o, skip_special_tokens=True) for o in predicted_indices]
            labels_decoded = [tokenizer.decode(l, skip_special_tokens=True) for l in labels]
            print(f"outputs shape: {outputs.shape}")
            print(f"labels shape: {labels.shape}")
            print(f"predicted_indices shape: {predicted_indices.shape}")
            print(f"predicted_indices[0] shape: {predicted_indices[0].shape}")
            # Calculate performance score
            score = scorer.compute(predictions=outputs_decoded, references=labels_decoded)
            score_totals.append(score)
            # Updates progress bar text
            progress_bar.set_postfix({f'Batch {batch_id} Score': score})
    # Return scores list of dictionaries
    return score_totals
```
%% Cell type:code id: tags:
``` python
model = BertSingleDense().to(device)
checkpoint_path = "/content/Models/BertSingleDense/BertSingleDense_best.pt"
checkpoint_dir = os.path.dirname(checkpoint_path)
model.load_state_dict(torch.load(checkpoint_path))
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
rouge_score = evaluate.load("rouge")
validation_dataset = CNNDailyMailDataset(tokenizer=tokenizer, split_type='validation',
                                         split_percentage=1, verbose=verbose)
validation_data_loader = DataLoader(validation_dataset, batch_size=16)
model_evaluate_scores = evaluate_model_and_debug(model, validation_data_loader,
                                                 rouge_score, tokenizer, device=device)
print(model_evaluate_scores)
```