Skip to content
Snippets Groups Projects

Draft: Testing

Closed eca1g19 requested to merge testing into main
1 file
+ 199
0
Compare changes
  • Side-by-side
  • Inline
+ 199
0
import torch
import datasets
import utils
import evaluate
from transformers import BertTokenizerFast, BertGenerationConfig
from transformers import EncoderDecoderModel
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from nltk.tokenize import sent_tokenize
import numpy as np
# Report up front whether CUDA is visible to torch.
if torch.cuda.is_available():
    print("RUNNING ON GPU")
else:
    print("RUNNING ON CPU")

# Metric object used by compute_metrics to score generated text.
rogue_score = evaluate.load("rouge")

# Run switches:
#   FINETUNE_ON_CUSTOM_DATASET - use utils.load_custom_dataset() instead of
#                                CNN/DailyMail.
#   NEW_MODEL                  - build the model from 'bert-base-uncased'
#                                instead of reloading a saved checkpoint.
FINETUNE_ON_CUSTOM_DATASET = False
NEW_MODEL = False

# Decoding options, kept together so they can be read at a glance.
# NOTE(review): generation_config is not referenced again in the visible
# script; the same values are written onto model.config further down.
_decoding_options = {
    "max_length": 142,
    "min_length": 56,
    "no_repeat_ngram_size": 3,
    "early_stopping": True,
    "length_penalty": 2.0,
    "num_beams": 4,
}
generation_config = BertGenerationConfig(**_decoding_options)
# Build the encoder-decoder model: either pair two copies of the base
# checkpoint, or reload a previously saved encoder-decoder checkpoint.
if NEW_MODEL:
    model_checkpoint = 'bert-base-uncased'
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        model_checkpoint, model_checkpoint
    )
else:
    model_checkpoint = r'C:\Users\uwu\PycharmProjects\COMP3200\bert-base-uncased-cnn-dailymail'
    model = EncoderDecoderModel.from_pretrained(model_checkpoint)

# Reuse the tokenizer's CLS and SEP tokens as the BOS and EOS tokens.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

# Special-token ids the model needs for decoding.
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# The top-level config takes its vocabulary size from the decoder config.
model.config.vocab_size = model.config.decoder.vocab_size

# Beam search parameters.
for _option, _value in (
    ("max_length", 142),
    ("min_length", 56),
    ("no_repeat_ngram_size", 3),
    ("early_stopping", True),
    ("length_penalty", 2.0),
    ("num_beams", 4),
):
    setattr(model.config, _option, _value)
# Load the training and validation data for the selected run.
if FINETUNE_ON_CUSTOM_DATASET:
    train_data = utils.load_custom_dataset()
    # NOTE(review): this is the same call that produced train_data, so
    # evaluation presumably runs on the training data - confirm against
    # utils.load_custom_dataset.
    validation_data = utils.load_custom_dataset()
else:
    train_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train")
    validation_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:10%]")

# Tokenisation limits and training hyperparameters.
max_input_length = 512
# NOTE(review): targets are truncated to 30 tokens while model.config asks
# for generations of 56-142 tokens - confirm this mismatch is intended.
max_target_length = 30
batch_size = 8
num_train_epochs = 8

# Name of the run: the last component of the checkpoint path or model id.
# Backslashes are normalised first because the local checkpoint is a Windows
# path; splitting on "/" alone would return the entire absolute path. A
# trailing separator is dropped so the name is never empty.
model_name = model_checkpoint.replace("\\", "/").rstrip("/").split("/")[-1]

print("Training Data Length: ", len(train_data))
print("Eval Data Length: ", len(validation_data))
# Training configuration. Both run types share every hyperparameter; only the
# output directory and the evaluation schedule depend on the run switch.
args = Seq2SeqTrainingArguments(
    output_dir=(
        f"{model_name}-finetuned-custom-dataset"
        if FINETUNE_ON_CUSTOM_DATASET
        else f"{model_name}"
    ),
    evaluation_strategy="epoch" if FINETUNE_ON_CUSTOM_DATASET else "steps",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    push_to_hub=True,
)
def preprocess_data(batch):
    """Tokenise one batch of the custom dataset for encoder-decoder training.

    ``batch["text"]`` is tokenised as the encoder input and
    ``batch["ingredients"]`` as the target. Both are truncated and padded to
    ``max_input_length`` / ``max_target_length`` respectively.

    Adds ``input_ids``, ``attention_mask``, ``decoder_input_ids``,
    ``decoder_attention_mask`` and ``labels`` to ``batch`` and returns it.
    """
    encoder_side = tokenizer(
        batch["text"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    decoder_side = tokenizer(
        batch["ingredients"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )
    pad_id = tokenizer.pad_token_id

    batch["input_ids"] = encoder_side.input_ids
    batch["attention_mask"] = encoder_side.attention_mask
    # NOTE(review): the decoder inputs are the unshifted target ids, i.e. the
    # same ids the labels are built from - confirm this is what the installed
    # transformers version expects.
    batch["decoder_input_ids"] = decoder_side.input_ids
    batch["decoder_attention_mask"] = decoder_side.attention_mask
    # Labels mirror the target ids, with every padding position set to -100
    # (the default ignore_index of PyTorch's cross-entropy loss).
    batch["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in decoder_side.input_ids
    ]
    return batch
def preprocess_data_cnn_dailymail(batch):
    """Tokenise one CNN/DailyMail batch for encoder-decoder training.

    ``batch["article"]`` is tokenised as the encoder input and
    ``batch["highlights"]`` as the target. Both are truncated and padded to
    ``max_input_length`` / ``max_target_length`` respectively.

    Adds ``input_ids``, ``attention_mask``, ``decoder_input_ids``,
    ``decoder_attention_mask`` and ``labels`` to ``batch`` and returns it.
    """
    encoder_side = tokenizer(
        batch["article"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    decoder_side = tokenizer(
        batch["highlights"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    batch["input_ids"] = encoder_side.input_ids
    batch["attention_mask"] = encoder_side.attention_mask
    # NOTE(review): the decoder inputs are the unshifted target ids, i.e. the
    # same ids used as labels - confirm this is what the installed
    # transformers version expects.
    batch["decoder_input_ids"] = decoder_side.input_ids
    batch["decoder_attention_mask"] = decoder_side.attention_mask
    # NOTE(review): unlike preprocess_data, padding positions are NOT replaced
    # with -100 here, so the labels still contain pad token ids.
    batch["labels"] = list(decoder_side.input_ids)
    return batch
def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: Evaluation output exposing ``predictions`` and
            ``label_ids`` and unpacking into ``(predictions, labels)``,
            both arrays of token ids.

    Returns:
        dict mapping each metric name reported by ``rogue_score`` to its
        score multiplied by 100 and rounded to 4 decimal places.
    """
    print("Eval_pred Label IDs: ", eval_pred.label_ids)
    print("Preds_ids: ", eval_pred.predictions)
    predictions, labels = eval_pred
    print("Predictions: ", predictions)

    # -100 is the "ignore" marker used for masked label positions (and for
    # padding added when batches are gathered); it is not a vocabulary id, so
    # it must be swapped for the pad token before decoding. The original
    # compared against 100, which never matched the marker and instead
    # overwrote the genuine token id 100.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode token ids into text.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence.
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    print("Preds: ", decoded_preds)
    print("Labels: ", decoded_labels)

    result = rogue_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # The metric values either expose ``.mid.fmeasure`` (aggregate objects)
    # or are plain numbers; handle both shapes.
    try:
        result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    except AttributeError:
        result = {key: value * 100 for key, value in result.items()}
    print("Result: ", result)
    return {k: round(v, 4) for k, v in result.items()}
# Choose the preprocessing function and the raw text columns to drop for the
# selected dataset (maybe keep these columns and test).
if FINETUNE_ON_CUSTOM_DATASET:
    _preprocess_fn = preprocess_data
    _raw_columns = ("text", "ingredients")
else:
    _preprocess_fn = preprocess_data_cnn_dailymail
    _raw_columns = ('article', 'highlights', 'id')

# Tokenise both splits in batches; each call gets its own column list.
train_data = train_data.map(
    _preprocess_fn,
    batched=True,
    batch_size=batch_size,
    remove_columns=list(_raw_columns),
)
validation_data = validation_data.map(
    _preprocess_fn,
    batched=True,
    batch_size=batch_size,
    remove_columns=list(_raw_columns),
)

# Return torch tensors from both datasets (may need to specify column names).
train_data.set_format(type="torch")
validation_data.set_format(type="torch")

# Wire everything into the trainer and start training.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=validation_data,
    compute_metrics=compute_metrics,
)
trainer.train()
\ No newline at end of file
Loading