Skip to content
Snippets Groups Projects
Commit 6b2cb6df authored by Liam Byrne's avatar Liam Byrne
Browse files

implementing heterogeneous GNN manually

parent 2e26cfaf
No related branches found
No related tags found
No related merge requests found
...@@ -12,7 +12,8 @@ class GAT(torch.nn.Module): ...@@ -12,7 +12,8 @@ class GAT(torch.nn.Module):
self.conv1 = GATConv((-1, -1), hidden_channels, add_self_loops=False) self.conv1 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
self.conv2 = GATConv((-1, -1), hidden_channels, add_self_loops=False) self.conv2 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
self.conv3 = GATConv((-1, -1), hidden_channels, add_self_loops=False) self.conv3 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
self.lin = Linear(hidden_channels, 3) self.lin = Linear(hidden_channels, 2)
self.softmax = torch.nn.Softmax(dim=1)
self.pool = MeanAggregation() self.pool = MeanAggregation()
def forward(self, x, edge_index, batch, post_emb): def forward(self, x, edge_index, batch, post_emb):
...@@ -30,7 +31,7 @@ class GAT(torch.nn.Module): ...@@ -30,7 +31,7 @@ class GAT(torch.nn.Module):
# 3. Apply a final classifier # 3. Apply a final classifier
#x = F.dropout(x, p=0.5, training=self.training) #x = F.dropout(x, p=0.5, training=self.training)
x = self.lin(x) x = self.lin(x)
#x = self.softmax(x)
return x return x
...@@ -38,8 +39,11 @@ def train(model, train_loader): ...@@ -38,8 +39,11 @@ def train(model, train_loader):
model.train() model.train()
for data in train_loader: # Iterate in batches over the training dataset. for data in train_loader: # Iterate in batches over the training dataset.
out = model(data.x, data.edge_index, data.batch) # Perform a single forward pass. print(data)
loss = criterion(out, data.y) # Compute the loss.
out = model(data.x_dict, data.edge_index_dict, data.batch_dict, torch.concat([data.question_emb, data.answer_emb])) # Perform a single forward pass.
print(out, data.label)
loss = criterion(out, data.label) # Compute the loss.
loss.backward() # Derive gradients. loss.backward() # Derive gradients.
optimizer.step() # Update parameters based on gradients. optimizer.step() # Update parameters based on gradients.
optimizer.zero_grad() # Clear gradients. optimizer.zero_grad() # Clear gradients.
...@@ -57,25 +61,25 @@ def test(loader): ...@@ -57,25 +61,25 @@ def test(loader):
if __name__ == '__main__': if __name__ == '__main__':
dataset = UserGraphDataset(root="data") dataset = UserGraphDataset(root="../data", skip_processing=True)
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [0.6, 0.1, 0.3]) train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [0.6, 0.1, 0.3])
train_loader = DataLoader(train_dataset, batch_size=64) print(dataset.num_node_features)
train_loader = DataLoader(train_dataset, batch_size=1)
val_loader = DataLoader(val_dataset, batch_size=64) val_loader = DataLoader(val_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64) test_loader = DataLoader(test_dataset, batch_size=64)
model = GAT(hidden_channels=64) model = GAT(hidden_channels=64)
sample = train_dataset[0]
example_graph = train_loader[0] metadata = (['question', 'answer', 'comment', 'tag', 'module'], [('tag', 'describes', 'question'), ('tag', 'describes', 'answer'), ('tag', 'describes', 'comment'), ('module', 'imported_in', 'question'), ('module', 'imported_in', 'answer'), ('question', 'rev_describes', 'tag'), ('answer', 'rev_describes', 'tag'), ('comment', 'rev_describes', 'tag'), ('question', 'rev_imported_in', 'module'), ('answer', 'rev_imported_in', 'module')])
print(example_graph) model = to_hetero(model, metadata, aggr='sum')
model = to_hetero(model, example_graph.metadata()) #print(model(sample.x_dict, sample.edge_index_dict, sample.batch_dict))
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss() criterion = torch.nn.CrossEntropyLoss()
for epoch in range(1, 10): for epoch in range(1, 10):
train(model, train_loader) train(model, train_loader)
train_acc = test(train_loader) #train_acc = test(train_loader)
print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}') #print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')
File added
No preview for this file type
No preview for this file type
import logging import logging
import os.path import os.path
import pickle import pickle
import re
import sqlite3 import sqlite3
from typing import List
import pandas as pd import pandas as pd
import torch import torch
from bs4 import MarkupResemblesLocatorWarning from bs4 import MarkupResemblesLocatorWarning
from torch_geometric.data import Dataset, download_url, Data from torch_geometric.data import Dataset, download_url, Data, HeteroData
from tqdm import tqdm from tqdm import tqdm
import warnings import warnings
warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning) warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning)
...@@ -20,12 +22,14 @@ log = logging.getLogger("dataset") ...@@ -20,12 +22,14 @@ log = logging.getLogger("dataset")
class UserGraphDataset(Dataset): class UserGraphDataset(Dataset):
def __init__(self, root, transform=None, pre_transform=None, pre_filter=None, db_address:str=None, question_count=70000): def __init__(self, root, transform=None, pre_transform=None, pre_filter=None, db_address:str=None, skip_processing=False):
self._question_count = question_count self._skip_processing = skip_processing
# Connect to database. # Connect to database.
if db_address is not None: if db_address is not None:
self._db = sqlite3.connect(db_address) self._db = sqlite3.connect(db_address)
self._post_embedding_builder = PostEmbedding() self._post_embedding_builder = PostEmbedding()
# Call init last, as it may trigger the process function.
super().__init__(root, transform, pre_transform, pre_filter) super().__init__(root, transform, pre_transform, pre_filter)
@property @property
...@@ -34,63 +38,94 @@ class UserGraphDataset(Dataset): ...@@ -34,63 +38,94 @@ class UserGraphDataset(Dataset):
@property @property
def processed_file_names(self): def processed_file_names(self):
return os.listdir("data/processed") if self._skip_processing:
return os.listdir("../data/processed")
return []
def download(self): def download(self):
pass pass
def get_unprocessed_ids(self):
    """Work out which questions still need processing and the next file index.

    Reads the pickled list of question ids from
    ``../data/raw/valid_questions.pkl`` and scans the file names in
    ``../data/processed``. A file name containing ``id_<digits>`` marks that
    question id as already processed; a file name containing
    ``data_<digits>`` contributes to the highest index seen so far.

    Returns:
        A tuple ``(unprocessed, next_idx)`` where ``unprocessed`` is the list
        of question ids (in their pickled order) that have no processed file
        yet, and ``next_idx`` is one more than the highest ``data_<digits>``
        index found (``0`` when none is found).
    """
    # NOTE(review): pickle.load can execute arbitrary code; this file must
    # only ever come from a trusted source (it is written by the project's
    # own notebook).
    with open("../data/raw/valid_questions.pkl", 'rb') as ids_file:
        question_ids = pickle.load(ids_file)

    # A set gives O(1) membership tests when filtering the question ids below.
    processed = set()
    max_idx = -1
    for file_name in os.listdir("../data/processed"):
        question_id_match = re.search(r"id_(\d+)", file_name)
        if question_id_match:
            processed.add(int(question_id_match.group(1)))

        idx_match = re.search(r"data_(\d+)", file_name)
        if idx_match:
            max_idx = max(max_idx, int(idx_match.group(1)))

    # Keep only the question ids that have not been processed yet.
    unprocessed = [q_id for q_id in question_ids if q_id not in processed]
    return unprocessed, max_idx + 1
def process(self): def process(self):
idx = 0 """
valid_questions = self.fetch_valid_questions() """
log.info("Processing data...")
# Fetch the unprocessed questions and the next index to use.
unprocessed, idx = self.get_unprocessed_ids()
print(unprocessed, idx)
# Fetch questions from database.
valid_questions = self.fetch_questions_by_post_ids(unprocessed)
for row in tqdm(valid_questions.itertuples(), total=len(valid_questions)): for row in tqdm(valid_questions.itertuples(), total=len(valid_questions)):
# Build Question embedding # Build Question embedding
question_emb = self._post_embedding_builder( question_word_emb, question_code_emb, _ = self._post_embedding_builder(
row.question_body, row.question_body,
use_bert=True, use_bert=True,
title=row.question_title title=row.question_title
) )
question_emb = torch.concat((question_word_emb, question_code_emb))
# Fetch answers to question
answers_to_question = self.fetch_answers_for_question(row.post_id) answers_to_question = self.fetch_answers_for_question(row.post_id)
# Build Answer embeddings # Build Answer embeddings
for _, answer_body, answer_user_id, score in answers_to_question.itertuples(): for _, answer_body, answer_user_id, score in answers_to_question.itertuples():
answer_emb = self._post_embedding_builder( label = torch.tensor([1 if score > 0 else 0], dtype=torch.long)
answer_body, answer_word_emb, answer_code_emb, _ = self._post_embedding_builder(
use_bert=True answer_body, use_bert=True
) )
answer_emb = torch.concat((answer_word_emb, answer_code_emb))
# Build graph # Build graph
graph = self.construct_graph(answer_user_id) graph: HeteroData = self.construct_graph(answer_user_id)
# pytorch geometric data object # pytorch geometric data object
data = Data( graph.__setattr__('question_emb', question_emb)
x=graph.x_dict, graph.__setattr__('answer_emb', answer_emb)
edge_index=graph.edge_index_dict, graph.__setattr__('label', label)
y=torch.LongTensor(1 if score > 0 else 0), torch.save(graph, os.path.join(self.processed_dir, f'data_{idx}_question_id_{row.post_id}'))
question_emb=question_emb,
answer_emb=answer_emb
)
torch.save(data, os.path.join(self.processed_dir, f'data_{idx}.pt'))
idx += 1 idx += 1
def len(self): def len(self):
return len(self.processed_file_names) return len(self.processed_file_names)
def get(self, idx): def get(self, idx):
data = torch.load(os.path.join(self.processed_dir, f'data_{idx}.pt')) file_name = [filename for filename in os.listdir('../data/processed/') if filename.startswith(f"data_{idx}")]
if len(file_name):
data = torch.load(os.path.join(self.processed_dir, file_name[0]))
return data return data
else:
raise Exception(f"Data with index {idx} not found.")
''' '''
Database functions Database functions
''' '''
def fetch_valid_questions(self): def fetch_questions_by_post_ids(self, post_ids: List[int]):
valid_questions = pd.read_sql_query(f""" questions_df = pd.read_sql_query(f"""
SELECT Q.PostId, Q.Body, Q.Title, Q.OwnerUserId FROM Post Q SELECT PostId, Body, Title, OwnerUserId FROM Post
INNER JOIN Post A ON Q.PostId = A.ParentId WHERE PostId IN ({','.join([str(x) for x in post_ids])})
WHERE (Q.Tags LIKE '%<python>%')
GROUP BY A.ParentId
HAVING SUM(A.Score) > 15
LIMIT {self._question_count}
""", self._db) """, self._db)
valid_questions.columns = ['post_id', 'question_body', 'question_title', 'question_user_id'] questions_df.columns = ['post_id', 'question_body', 'question_title', 'question_user_id']
return valid_questions return questions_df
def fetch_questions_by_user(self, user_id: int): def fetch_questions_by_user(self, user_id: int):
questions_df = pd.read_sql_query(f""" questions_df = pd.read_sql_query(f"""
...@@ -150,5 +185,18 @@ class UserGraphDataset(Dataset): ...@@ -150,5 +185,18 @@ class UserGraphDataset(Dataset):
if __name__ == '__main__': if __name__ == '__main__':
ds = UserGraphDataset('../data/', db_address='../stackoverflow.db', question_count=100) '''
print(ds.get(0)) Build List of question post_ids.
\ No newline at end of file This will be fed into the Dataset class to construct the graphs.
This setup allows us to have a fixed set of questions/answers
for each dataset (rather than selecting new questions each time).
'''
ds = UserGraphDataset('../data/', db_address='../stackoverflow.db', skip_processing=False)
data = ds.get(1)
print("Question ndim:", data.x_dict['question'].dim())
print("Answer ndim:", data.x_dict['answer'].dim())
print("Comment ndim:", data.x_dict['comment'].dim())
print("Tag ndim:", data.x_dict['tag'].dim())
print("Module ndim:", data.x_dict['module'].dim())
\ No newline at end of file
...@@ -113,14 +113,14 @@ class PostEmbedding(nn.Module): ...@@ -113,14 +113,14 @@ class PostEmbedding(nn.Module):
return torch.sum(word_embeddings, dim=0) / len(tokens) return torch.sum(word_embeddings, dim=0) / len(tokens)
def to_bert_embedding(self, text: str) -> torch.tensor: def to_bert_embedding(self, text: str) -> torch.tensor:
sentences = [i.text for i in self._en(text).sents] # if not len(text):
if not len(sentences): # return torch.zeros(768)
return torch.zeros(768) encodings = self._bert_tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=512)
encodings = self._bert_tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length=512)
with torch.no_grad(): with torch.no_grad():
outputs = self._bert_model(**encodings, output_hidden_states=True) outputs = self._bert_model(**encodings)
cls = outputs.hidden_states[-1][0,0,:] last_layer = outputs.last_hidden_state
return cls cls = last_layer[:, 0, :]
return torch.squeeze(cls) # Converts from dim [1, 768] to [768]
def to_code_bert_embedding(self, code): def to_code_bert_embedding(self, code):
...@@ -215,5 +215,5 @@ class PostEmbedding(nn.Module): ...@@ -215,5 +215,5 @@ class PostEmbedding(nn.Module):
if __name__ == '__main__': if __name__ == '__main__':
pe = PostEmbedding() pe = PostEmbedding()
#print(pe.to_code_bert_embedding("\n".join(["for i in range(32):\n #return 6 or something\n"])).shape) #print(pe.to_code_bert_embedding("\n".join(["for i in range(32):\n #return 6 or something\n"])).shape)
print(pe.to_bert_embedding("This is a test sentence.")) print(pe.to_bert_embedding("This is a test sentence.").shape)
#print([x.module for x in pe.get_imports_via_regex(BeautifulSoup("<code>import ast<\code>", 'lxml'))]) #print([x.module for x in pe.get_imports_via_regex(BeautifulSoup("<code>import ast<\code>", 'lxml'))])
This diff is collapsed.
...@@ -9,6 +9,7 @@ import torch ...@@ -9,6 +9,7 @@ import torch
from torch_geometric.data import HeteroData from torch_geometric.data import HeteroData
from post_embedding_builder import Import, PostEmbedding from post_embedding_builder import Import, PostEmbedding
import torch_geometric.transforms as T
class StaticGraphConstruction: class StaticGraphConstruction:
...@@ -75,7 +76,7 @@ class StaticGraphConstruction: ...@@ -75,7 +76,7 @@ class StaticGraphConstruction:
for module in modules: for module in modules:
self._module_to_comment_edges.append((self._known_modules[module], i)) self._module_to_comment_edges.append((self._known_modules[module], i))
yield torch.concat((word_embedding, code_embedding)) yield word_embedding
def process_tags(self): def process_tags(self):
if not len(self._known_tags): if not len(self._known_tags):
...@@ -121,21 +122,27 @@ class StaticGraphConstruction: ...@@ -121,21 +122,27 @@ class StaticGraphConstruction:
tag_nodes = list(self.process_tags()) tag_nodes = list(self.process_tags())
module_nodes = list(self.process_modules()) module_nodes = list(self.process_modules())
if len(question_nodes): # Assign node features
self._data['question'].x = torch.stack(question_nodes) self._data['question'].x = torch.stack(question_nodes) if len(question_nodes) else torch.empty(0, 1536)
if len(answer_nodes):
self._data['answer'].x = torch.stack(answer_nodes) self._data['answer'].x = torch.stack(answer_nodes) if len(answer_nodes) else torch.empty(0, 1536)
if len(comment_nodes):
self._data['comment'].x = torch.stack(comment_nodes) self._data['comment'].x = torch.stack(comment_nodes) if len(comment_nodes) else torch.empty(0, 768)
if len(tag_nodes):
self._data['tag'].x = torch.stack(tag_nodes) self._data['tag'].x = torch.stack(tag_nodes) if len(tag_nodes) else torch.empty(0, 90)
if len(module_nodes):
self._data['module'].x = torch.stack(module_nodes) self._data['module'].x = torch.stack(module_nodes) if len(module_nodes) else torch.empty(0, 110)
self._data['tag', 'describes', 'question'].edge_index = torch.tensor(self._tag_to_question_edges).t().contiguous() # Assign edge indexes
self._data['tag', 'describes', 'answer'].edge_index = torch.tensor(self._tag_to_answer_edges).t().contiguous() self._data['tag', 'describes', 'question'].edge_index = torch.tensor(self._tag_to_question_edges).t().contiguous() if len(self._tag_to_question_edges) else torch.empty(2,0, dtype=torch.long)
self._data['tag', 'describes', 'comment'].edge_index = torch.tensor(self._tag_to_comment_edges).t().contiguous() self._data['tag', 'describes', 'answer'].edge_index = torch.tensor(self._tag_to_answer_edges).t().contiguous() if len(self._tag_to_answer_edges) else torch.empty(2,0, dtype=torch.long)
self._data['module', 'imported_in', 'question'].edge_index = torch.tensor(self._module_to_question_edges).t().contiguous() self._data['tag', 'describes', 'comment'].edge_index = torch.tensor(self._tag_to_comment_edges).t().contiguous() if len(self._tag_to_comment_edges) else torch.empty(2,0, dtype=torch.long)
self._data['module', 'imported_in', 'answer'].edge_index = torch.tensor(self._module_to_answer_edges).t().contiguous() self._data['module', 'imported_in', 'question'].edge_index = torch.tensor(self._module_to_question_edges).t().contiguous() if len(self._module_to_question_edges) else torch.empty(2,0, dtype=torch.long)
self._data['module', 'imported_in', 'answer'].edge_index = torch.tensor(self._module_to_answer_edges).t().contiguous() if len(self._module_to_answer_edges) else torch.empty(2,0, dtype=torch.long)
return self._data
# Remove isolated nodes, and convert to undirected graph
graph_out = T.remove_isolated_nodes.RemoveIsolatedNodes()(self._data)
graph_out = T.ToUndirected()(graph_out)
graph_out.metadata()
return graph_out
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import sqlite3 import sqlite3
import pandas as pd import pandas as pd
import logging import logging
import pickle
logging.basicConfig() logging.basicConfig()
logging.getLogger().setLevel(logging.INFO) logging.getLogger().setLevel(logging.INFO)
log = logging.getLogger("training-set-builder") log = logging.getLogger("training-set-builder")
# Create your connection. # Create your connection.
db = sqlite3.connect('../stackoverflow.db') db = sqlite3.connect('../stackoverflow.db')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
QUESTIONS_RETREIVED = 12 QUESTIONS_RETREIVED = 12
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
valid_questions = pd.read_sql_query(f""" valid_questions = pd.read_sql_query(f"""
SELECT Q.PostId, Q.Body, Q.Title, Q.OwnerUserId FROM Post Q SELECT Q.PostId, Q.Body, Q.Title, Q.OwnerUserId FROM Post Q
INNER JOIN Post A ON Q.PostId = A.ParentId INNER JOIN Post A ON Q.PostId = A.ParentId
WHERE (Q.Tags LIKE '%<python>%') WHERE (Q.Tags LIKE '%<python>%')
GROUP BY A.ParentId GROUP BY A.ParentId
HAVING SUM(A.Score) > 15 HAVING SUM(A.Score) > 15
LIMIT {QUESTIONS_RETREIVED} LIMIT {QUESTIONS_RETREIVED}
""", db) """, db)
valid_questions.columns = ['post_id', 'question_body', 'question_title', 'question_user_id'] valid_questions.columns = ['post_id', 'question_body', 'question_title', 'question_user_id']
valid_questions valid_questions
``` ```
%% Output %% Output
post_id question_body \ post_id question_body \
0 337 <p>I am about to build a piece of a project th... 0 337 <p>I am about to build a piece of a project th...
1 469 <p>I am using the Photoshop's javascript API t... 1 469 <p>I am using the Photoshop's javascript API t...
2 502 <p>I have a cross-platform (Python) applicatio... 2 502 <p>I have a cross-platform (Python) applicatio...
3 535 <p>I am starting to work on a hobby project wi... 3 535 <p>I am starting to work on a hobby project wi...
4 594 <p>There are several ways to iterate over a re... 4 594 <p>There are several ways to iterate over a re...
5 683 <p>I don't remember whether I was dreaming or ... 5 683 <p>I don't remember whether I was dreaming or ...
6 742 <p><a href="http://www.djangoproject.com/" rel... 6 742 <p><a href="http://www.djangoproject.com/" rel...
7 766 <p>I can get Python to work with Postgresql bu... 7 766 <p>I can get Python to work with Postgresql bu...
8 773 <p>I haven't been able to find an understandab... 8 773 <p>I haven't been able to find an understandab...
9 972 <p>I've read that it is possible to add a meth... 9 972 <p>I've read that it is possible to add a meth...
10 1171 <p>I need to be able to manipulate a large (10... 10 1171 <p>I need to be able to manipulate a large (10...
11 1476 <p>How do you express an integer as a binary n... 11 1476 <p>How do you express an integer as a binary n...
question_title question_user_id question_title question_user_id
0 XML Processing in Python 111 0 XML Processing in Python 111
1 How can I find the full path to a font from it... 147 1 How can I find the full path to a font from it... 147
2 Get a preview JPEG of a PDF on Windows? 147 2 Get a preview JPEG of a PDF on Windows? 147
3 Continuous Integration System for a Python Cod... 154 3 Continuous Integration System for a Python Cod... 154
4 cx_Oracle: How do I iterate over a result set? 116 4 cx_Oracle: How do I iterate over a result set? 116
5 Using 'in' to match an attribute of Python obj... 199 5 Using 'in' to match an attribute of Python obj... 199
6 Class views in Django 189 6 Class views in Django 189
7 Python and MySQL 1384652 7 Python and MySQL 1384652
8 How do I use itertools.groupby()? 207 8 How do I use itertools.groupby()? 207
9 Adding a method to an existing object instance 145 9 Adding a method to an existing object instance 145
10 What is the most efficient graph data structur... 280 10 What is the most efficient graph data structur... 280
11 How do you express binary literals in Python? 92 11 How do you express binary literals in Python? 92
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
save = True
if save:
with open("../data/raw/valid_questions.pkl", "wb") as f:
pickle.dump(valid_questions['post_id'].to_list(), f)
```
%% Cell type:code id: tags:
``` python
def fetch_questions_by_user(user_id: int, db): def fetch_questions_by_user(user_id: int, db):
questions_df = pd.read_sql_query(f""" questions_df = pd.read_sql_query(f"""
SELECT * SELECT *
FROM Post FROM Post
WHERE Tags LIKE '%python%' AND (PostTypeId = 1) AND ((LastEditorUserId = {user_id}) OR (OwnerUserId = {user_id})) WHERE Tags LIKE '%python%' AND (PostTypeId = 1) AND ((LastEditorUserId = {user_id}) OR (OwnerUserId = {user_id}))
""", db) """, db)
questions_df.set_index('PostId', inplace=True) questions_df.set_index('PostId', inplace=True)
return questions_df return questions_df
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def fetch_answers_by_user(user_id: int, db): def fetch_answers_by_user(user_id: int, db):
answers_df = pd.read_sql_query(f""" answers_df = pd.read_sql_query(f"""
SELECT A.Tags, B.* SELECT A.Tags, B.*
FROM Post A FROM Post A
INNER JOIN Post B ON (B.ParentId = A.PostId) AND (B.ParentId IS NOT NULL) INNER JOIN Post B ON (B.ParentId = A.PostId) AND (B.ParentId IS NOT NULL)
WHERE A.Tags LIKE '%python%' AND (B.PostTypeId = 2) AND ((B.LastEditorUserId = {user_id}) OR (B.OwnerUserId = {user_id})) WHERE A.Tags LIKE '%python%' AND (B.PostTypeId = 2) AND ((B.LastEditorUserId = {user_id}) OR (B.OwnerUserId = {user_id}))
""", db) """, db)
answers_df = answers_df.loc[:, ~answers_df.columns.duplicated()].copy() answers_df = answers_df.loc[:, ~answers_df.columns.duplicated()].copy()
answers_df.set_index('PostId', inplace=True) answers_df.set_index('PostId', inplace=True)
return answers_df return answers_df
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def fetch_answers_for_question(question_post_id: int, db): def fetch_answers_for_question(question_post_id: int, db):
answers_df = pd.read_sql_query(f""" answers_df = pd.read_sql_query(f"""
SELECT Body, OwnerUserId, Score SELECT Body, OwnerUserId, Score
FROM Post FROM Post
WHERE ParentId = {question_post_id} WHERE ParentId = {question_post_id}
""", db) """, db)
answers_df = answers_df.dropna() answers_df = answers_df.dropna()
return answers_df return answers_df
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def fetch_comments_by_user(user_id: int, db): def fetch_comments_by_user(user_id: int, db):
comments_on_questions_df = pd.read_sql_query(f""" comments_on_questions_df = pd.read_sql_query(f"""
SELECT A.Tags, B.* SELECT A.Tags, B.*
FROM Post A FROM Post A
INNER JOIN Comment B ON (B.PostId = A.PostId) INNER JOIN Comment B ON (B.PostId = A.PostId)
WHERE A.Tags LIKE '%python%' AND (B.UserId = {user_id}) AND (A.PostTypeId = 1) WHERE A.Tags LIKE '%python%' AND (B.UserId = {user_id}) AND (A.PostTypeId = 1)
""", db) """, db)
comments_on_questions_df.set_index('CommentId', inplace=True) comments_on_questions_df.set_index('CommentId', inplace=True)
comments_on_answers_df = pd.read_sql_query(f""" comments_on_answers_df = pd.read_sql_query(f"""
SELECT A.Tags, C.* SELECT A.Tags, C.*
FROM Post A FROM Post A
INNER JOIN Post B ON (B.ParentId = A.PostId) AND (B.ParentId IS NOT NULL) INNER JOIN Post B ON (B.ParentId = A.PostId) AND (B.ParentId IS NOT NULL)
INNER JOIN Comment C ON (B.PostId = C.PostId) INNER JOIN Comment C ON (B.PostId = C.PostId)
WHERE A.Tags LIKE '%python%' AND (C.UserId = {user_id}) AND (B.PostTypeId = 2) WHERE A.Tags LIKE '%python%' AND (C.UserId = {user_id}) AND (B.PostTypeId = 2)
""", db) """, db)
comments_on_answers_df.set_index('CommentId', inplace=True) comments_on_answers_df.set_index('CommentId', inplace=True)
return pd.concat([comments_on_questions_df, comments_on_answers_df]) return pd.concat([comments_on_questions_df, comments_on_answers_df])
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def construct_graph(user_id, db): def construct_graph(user_id, db):
graph_constructor = StaticGraphConstruction() graph_constructor = StaticGraphConstruction()
qs = fetch_questions_by_user(user_id, db) qs = fetch_questions_by_user(user_id, db)
ans = fetch_answers_by_user(user_id, db) ans = fetch_answers_by_user(user_id, db)
cs = fetch_comments_by_user(user_id, db) cs = fetch_comments_by_user(user_id, db)
return graph_constructor.construct(questions=qs, answers=ans, comments=cs) return graph_constructor.construct(questions=qs, answers=ans, comments=cs)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import torch
from datetime import date from datetime import date
from post_embedding_builder import PostEmbedding from post_embedding_builder import PostEmbedding
from static_graph_construction import StaticGraphConstruction from static_graph_construction import StaticGraphConstruction
import pickle import pickle
post_embedding_builder = PostEmbedding()
# Number of (question, answer) samples accumulated before they are pickled.
FILE_BATCH_SIZE = 10


def _empty_batch():
    """Return a fresh container with one empty list per stored field."""
    return {"graphs": [], "questions": [], "answers": [], "labels": []}


def _write_batch(batch, batch_index):
    """Pickle ``batch`` into ../data/raw under a name unique to ``batch_index``.

    A date-only file name is identical for every write made on the same day,
    so each dump would replace the one before it; the index suffix keeps
    every batch on disk.
    NOTE(review): if a loader elsewhere expects the bare ``batch<date>.pkl``
    name, it has to be pointed at the indexed files - confirm against callers.
    """
    log.info("writing batch to file")
    with open(f"../data/raw/batch{date.today():%m-%d-%Y}_{batch_index}.pkl", "wb") as f:
        pickle.dump(batch, f)


data = _empty_batch()
batch_c = 1     # index of the next batch file to be written
question_c = 1  # 1-based counters, used only for progress logging
answer_c = 1
for row in valid_questions.itertuples():
    log.info(f"processing question {question_c}")
    # Build Question embedding
    question_word_emb, question_code_emb, _ = post_embedding_builder(
        row.question_body,
        use_bert=True,
        title=row.question_title
    )
    question_emb = torch.concat((question_word_emb, question_code_emb))
    # Build Answer embeddings
    for _, answer_body, answer_user_id, score in fetch_answers_for_question(row.post_id, db).itertuples():
        log.info(f"processing answer {answer_c}")
        answer_word_emb, answer_code_emb, _ = post_embedding_builder(
            answer_body, use_bert=True
        )
        answer_emb = torch.concat((answer_word_emb, answer_code_emb))
        # Construct User Graph
        ug = construct_graph(answer_user_id, db)
        data["graphs"].append(ug)
        data["questions"].append(question_emb)
        data["answers"].append(answer_emb)
        # Binary label: 1 for a positively scored answer, 0 otherwise.
        data["labels"].append(1 if score > 0 else 0)
        # Dataset will grow larger than memory, so batch in pickle files
        if len(data["labels"]) >= FILE_BATCH_SIZE:
            _write_batch(data, batch_c)
            batch_c += 1
            data = _empty_batch()
        answer_c += 1
    question_c += 1
    answer_c = 1  # answer numbering restarts for every question

# Persist the samples left over after the last full batch.
if data["labels"]:
    _write_batch(data, batch_c)
    data = _empty_batch()
``` ```
%% Output %% Output
INFO:post_embedding_builder:PostEmbedding instantiated! INFO:post_embedding_builder:PostEmbedding instantiated!
INFO:torchtext.vocab.vectors:Loading vectors from .vector_cache\glove.840B.300d.txt.pt INFO:torchtext.vocab.vectors:Loading vectors from .vector_cache\glove.840B.300d.txt.pt
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias'] Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
INFO:post_embedding_builder:PostEmbedding instantiated! INFO:post_embedding_builder:PostEmbedding instantiated!
INFO:torchtext.vocab.vectors:Loading vectors from .vector_cache\glove.840B.300d.txt.pt INFO:torchtext.vocab.vectors:Loading vectors from .vector_cache\glove.840B.300d.txt.pt
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias'] Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
INFO:training-set-builder:processing question 1 INFO:training-set-builder:processing question 1
INFO:training-set-builder:processing answer 1 INFO:training-set-builder:processing answer 1
INFO:training-set-builder:processing answer 2 INFO:training-set-builder:processing answer 2
INFO:training-set-builder:processing answer 3 INFO:training-set-builder:processing answer 3
INFO:training-set-builder:processing answer 4 INFO:training-set-builder:processing answer 4
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 5 INFO:training-set-builder:processing answer 5
INFO:training-set-builder:processing answer 6 INFO:training-set-builder:processing answer 6
INFO:training-set-builder:processing answer 7 INFO:training-set-builder:processing answer 7
INFO:training-set-builder:processing answer 8 INFO:training-set-builder:processing answer 8
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 9 INFO:training-set-builder:processing answer 9
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 10 INFO:training-set-builder:processing answer 10
INFO:training-set-builder:writing batch to file INFO:training-set-builder:writing batch to file
INFO:training-set-builder:processing answer 11 INFO:training-set-builder:processing answer 11
Token indices sequence length is longer than the specified maximum sequence length for this model (1056 > 512). Running this sequence through the model will result in indexing errors Token indices sequence length is longer than the specified maximum sequence length for this model (1056 > 512). Running this sequence through the model will result in indexing errors
INFO:training-set-builder:processing question 2 INFO:training-set-builder:processing question 2
INFO:training-set-builder:processing answer 1 INFO:training-set-builder:processing answer 1
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 2 INFO:training-set-builder:processing answer 2
INFO:training-set-builder:processing answer 3 INFO:training-set-builder:processing answer 3
INFO:training-set-builder:processing answer 4 INFO:training-set-builder:processing answer 4
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing question 3 INFO:training-set-builder:processing question 3
INFO:training-set-builder:processing answer 1 INFO:training-set-builder:processing answer 1
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 2 INFO:training-set-builder:processing answer 2
INFO:training-set-builder:processing answer 3 INFO:training-set-builder:processing answer 3
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing question 4 INFO:training-set-builder:processing question 4
INFO:training-set-builder:processing answer 1 INFO:training-set-builder:processing answer 1
INFO:training-set-builder:processing answer 2 INFO:training-set-builder:processing answer 2
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:404: MarkupResemblesLocatorWarning: The input looks more like a URL than markup. You may want to use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:404: MarkupResemblesLocatorWarning: The input looks more like a URL than markup. You may want to use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
warnings.warn( warnings.warn(
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:writing batch to file INFO:training-set-builder:writing batch to file
INFO:training-set-builder:processing answer 3 INFO:training-set-builder:processing answer 3
INFO:training-set-builder:processing answer 4 INFO:training-set-builder:processing answer 4
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 5 INFO:training-set-builder:processing answer 5
INFO:training-set-builder:processing answer 6 INFO:training-set-builder:processing answer 6
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing question 5 INFO:training-set-builder:processing question 5
INFO:training-set-builder:processing answer 1 INFO:training-set-builder:processing answer 1
INFO:training-set-builder:processing answer 2 INFO:training-set-builder:processing answer 2
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 3 INFO:training-set-builder:processing answer 3
INFO:training-set-builder:processing question 6 INFO:training-set-builder:processing question 6
INFO:training-set-builder:processing answer 1 INFO:training-set-builder:processing answer 1
INFO:training-set-builder:processing answer 2 INFO:training-set-builder:processing answer 2
INFO:training-set-builder:processing answer 3 INFO:training-set-builder:processing answer 3
INFO:training-set-builder:writing batch to file INFO:training-set-builder:writing batch to file
INFO:training-set-builder:processing answer 4 INFO:training-set-builder:processing answer 4
INFO:training-set-builder:processing answer 5 INFO:training-set-builder:processing answer 5
INFO:training-set-builder:processing answer 6 INFO:training-set-builder:processing answer 6
INFO:training-set-builder:processing answer 7 INFO:training-set-builder:processing answer 7
INFO:training-set-builder:processing answer 8 INFO:training-set-builder:processing answer 8
INFO:training-set-builder:processing question 7 INFO:training-set-builder:processing question 7
INFO:training-set-builder:processing answer 1 INFO:training-set-builder:processing answer 1
INFO:training-set-builder:processing answer 2 INFO:training-set-builder:processing answer 2
INFO:training-set-builder:processing answer 3 INFO:training-set-builder:processing answer 3
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 4 INFO:training-set-builder:processing answer 4
INFO:training-set-builder:processing answer 5 INFO:training-set-builder:processing answer 5
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:writing batch to file INFO:training-set-builder:writing batch to file
INFO:training-set-builder:processing answer 6 INFO:training-set-builder:processing answer 6
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 7 INFO:training-set-builder:processing answer 7
INFO:training-set-builder:processing answer 8 INFO:training-set-builder:processing answer 8
INFO:training-set-builder:processing question 8 INFO:training-set-builder:processing question 8
INFO:training-set-builder:processing answer 1 INFO:training-set-builder:processing answer 1
INFO:training-set-builder:processing answer 2 INFO:training-set-builder:processing answer 2
INFO:training-set-builder:processing answer 3 INFO:training-set-builder:processing answer 3
INFO:training-set-builder:processing answer 4 INFO:training-set-builder:processing answer 4
INFO:training-set-builder:processing answer 5 INFO:training-set-builder:processing answer 5
INFO:training-set-builder:processing answer 6 INFO:training-set-builder:processing answer 6
INFO:training-set-builder:processing question 9 INFO:training-set-builder:processing question 9
INFO:training-set-builder:processing answer 1 INFO:training-set-builder:processing answer 1
INFO:training-set-builder:writing batch to file INFO:training-set-builder:writing batch to file
INFO:training-set-builder:processing answer 2 INFO:training-set-builder:processing answer 2
INFO:training-set-builder:processing answer 3 INFO:training-set-builder:processing answer 3
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 4 INFO:training-set-builder:processing answer 4
INFO:training-set-builder:processing answer 5 INFO:training-set-builder:processing answer 5
INFO:training-set-builder:processing answer 6 INFO:training-set-builder:processing answer 6
INFO:training-set-builder:processing answer 7 INFO:training-set-builder:processing answer 7
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 8 INFO:training-set-builder:processing answer 8
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 9 INFO:training-set-builder:processing answer 9
INFO:training-set-builder:processing answer 10 INFO:training-set-builder:processing answer 10
INFO:training-set-builder:processing answer 11 INFO:training-set-builder:processing answer 11
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:writing batch to file INFO:training-set-builder:writing batch to file
INFO:training-set-builder:processing answer 12 INFO:training-set-builder:processing answer 12
INFO:training-set-builder:processing answer 13 INFO:training-set-builder:processing answer 13
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 14 INFO:training-set-builder:processing answer 14
INFO:training-set-builder:processing question 10 INFO:training-set-builder:processing question 10
INFO:training-set-builder:processing answer 1 INFO:training-set-builder:processing answer 1
INFO:training-set-builder:processing answer 2 INFO:training-set-builder:processing answer 2
INFO:training-set-builder:processing answer 3 INFO:training-set-builder:processing answer 3
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 4 INFO:training-set-builder:processing answer 4
INFO:training-set-builder:processing answer 5 INFO:training-set-builder:processing answer 5
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 6 INFO:training-set-builder:processing answer 6
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 7 INFO:training-set-builder:processing answer 7
Token indices sequence length is longer than the specified maximum sequence length for this model (879 > 512). Running this sequence through the model will result in indexing errors Token indices sequence length is longer than the specified maximum sequence length for this model (879 > 512). Running this sequence through the model will result in indexing errors
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:writing batch to file INFO:training-set-builder:writing batch to file
INFO:training-set-builder:processing answer 8 INFO:training-set-builder:processing answer 8
INFO:training-set-builder:processing answer 9 INFO:training-set-builder:processing answer 9
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:404: MarkupResemblesLocatorWarning: The input looks more like a URL than markup. You may want to use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:404: MarkupResemblesLocatorWarning: The input looks more like a URL than markup. You may want to use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 10 INFO:training-set-builder:processing answer 10
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 11 INFO:training-set-builder:processing answer 11
INFO:training-set-builder:processing answer 12 INFO:training-set-builder:processing answer 12
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 13 INFO:training-set-builder:processing answer 13
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 14 INFO:training-set-builder:processing answer 14
INFO:training-set-builder:processing answer 15 INFO:training-set-builder:processing answer 15
INFO:training-set-builder:processing answer 16 INFO:training-set-builder:processing answer 16
INFO:training-set-builder:processing answer 17 INFO:training-set-builder:processing answer 17
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:writing batch to file INFO:training-set-builder:writing batch to file
INFO:training-set-builder:processing answer 18 INFO:training-set-builder:processing answer 18
INFO:training-set-builder:processing answer 19 INFO:training-set-builder:processing answer 19
INFO:training-set-builder:processing question 11 INFO:training-set-builder:processing question 11
INFO:training-set-builder:processing answer 1 INFO:training-set-builder:processing answer 1
INFO:training-set-builder:processing answer 2 INFO:training-set-builder:processing answer 2
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 3 INFO:training-set-builder:processing answer 3
INFO:training-set-builder:processing answer 4 INFO:training-set-builder:processing answer 4
INFO:training-set-builder:processing answer 5 INFO:training-set-builder:processing answer 5
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 6 INFO:training-set-builder:processing answer 6
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 7 INFO:training-set-builder:processing answer 7
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing question 12 INFO:training-set-builder:processing question 12
INFO:training-set-builder:processing answer 1 INFO:training-set-builder:processing answer 1
INFO:training-set-builder:writing batch to file INFO:training-set-builder:writing batch to file
INFO:training-set-builder:processing answer 2 INFO:training-set-builder:processing answer 2
INFO:training-set-builder:processing answer 3 INFO:training-set-builder:processing answer 3
INFO:training-set-builder:processing answer 4 INFO:training-set-builder:processing answer 4
INFO:training-set-builder:processing answer 5 INFO:training-set-builder:processing answer 5
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 6 INFO:training-set-builder:processing answer 6
INFO:training-set-builder:processing answer 7 INFO:training-set-builder:processing answer 7
C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup. C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
warnings.warn( warnings.warn(
INFO:training-set-builder:processing answer 8 INFO:training-set-builder:processing answer 8
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
X_raw[0] X_raw[0]
``` ```
%% Output %% Output
--------------------------------------------------------------------------- ---------------------------------------------------------------------------
NameError Traceback (most recent call last) NameError Traceback (most recent call last)
Cell In [10], line 1 Cell In [10], line 1
----> 1 X_raw[0] ----> 1 X_raw[0]
NameError: name 'X_raw' is not defined NameError: name 'X_raw' is not defined
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
X_raw[10] X_raw[10]
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment