Commit 2e26cfaf authored by Liam Byrne

Dataset implemented

parent 8afc2c7a
import torch
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GATConv, Linear, MeanAggregation, to_hetero

from dataset import UserGraphDataset


class GAT(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(12345)
        # Lazy (-1, -1) input sizes let to_hetero infer per-node-type dimensions.
        self.conv1 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
        self.conv2 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
        self.conv3 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
        # Lazy in_channels: the input width grows when post_emb is concatenated.
        self.lin = Linear(-1, 3)
        self.pool = MeanAggregation()

    def forward(self, x, edge_index, batch, post_emb):
        # 1. Obtain node embeddings.
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)
        # 2. Readout layer.
        x = self.pool(x, batch)  # [batch_size, hidden_channels]
        x = torch.cat([x, post_emb], dim=1)  # Append the post embedding to the graph readout.
        # 3. Apply a final classifier.
        # x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        return x


def train(model, train_loader):
    model.train()
    for data in train_loader:  # Iterate in batches over the training dataset.
        optimizer.zero_grad()  # Clear gradients.
        # Assumes question_emb/answer_emb are per-graph tensors stored by UserGraphDataset.
        post_emb = torch.cat([data.question_emb, data.answer_emb], dim=1)
        out = model(data.x, data.edge_index, data.batch, post_emb)  # Perform a single forward pass.
        loss = criterion(out, data.y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.


def test(loader):
    model.eval()
    correct = 0
    for data in loader:  # Iterate in batches over the training/test dataset.
        post_emb = torch.cat([data.question_emb, data.answer_emb], dim=1)
        out = model(data.x, data.edge_index, data.batch, post_emb)
        pred = out.argmax(dim=1)  # Use the class with highest probability.
        correct += int((pred == data.y).sum())  # Check against ground-truth labels.
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.


if __name__ == '__main__':
    dataset = UserGraphDataset(root="data")
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [0.6, 0.1, 0.3])

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)
    test_loader = DataLoader(test_dataset, batch_size=64)

    model = GAT(hidden_channels=64)
    example_graph = dataset[0]  # DataLoader is not indexable; take a sample from the dataset.
    print(example_graph)
    model = to_hetero(model, example_graph.metadata())

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(1, 10):
        train(model, train_loader)
        train_acc = test(train_loader)
        test_acc = test(test_loader)
        print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')
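A note on the `to_hetero` call: it rewrites the homogeneous `forward` so that node features and edge indices arrive as dictionaries keyed by node and edge type, which is why the dataset stores `x_dict`/`edge_index_dict`. A minimal sketch of that calling convention, with toy shapes and node types rather than the project's real ones:

``` python
import torch

# Toy heterogeneous inputs of the kind a to_hetero-converted model consumes.
x_dict = {
    'question': torch.rand(4, 16),  # 4 question nodes, toy 16-dim features
    'tag': torch.rand(3, 16),       # 3 tag nodes
}
edge_index_dict = {
    # Row 0: source (tag) indices; row 1: destination (question) indices.
    ('tag', 'describes', 'question'): torch.tensor([[0, 1, 2], [0, 1, 3]]),
}
# out_dict = hetero_model(x_dict, edge_index_dict, ...)  # one output tensor per node type
```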
import itertools
import logging
import random
import sqlite3
from typing import List, Tuple

import pandas as pd
import torch
from bs4 import BeautifulSoup
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from post_embedding_builder import PostEmbedding

logging.basicConfig(level=logging.INFO)
class ModuleEmbeddingTrainer:
    def __init__(self, emb_size: int, database_path: str = None):
        logger = logging.getLogger(self.__class__.__name__)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Proceeding with {self.device} . .")
        if database_path is not None:
            self.db = sqlite3.connect(database_path)
            logger.info(f"Connected to {database_path}")
        self.emb_size = emb_size
        self.emb_builder = PostEmbedding()

    def from_files(self, post_tags_path: str, tag_vocab: str):
        pass

    def from_db(self):
        post_body_series = pd.read_sql_query(
            "SELECT Body FROM Post WHERE (Tags LIKE '%python%') AND (Body LIKE '%import%') LIMIT 100000",
            self.db
        )
        # Extract imported module names from each post body.
        modules_series = post_body_series['Body'].apply(
            lambda html: [x.module for x in self.emb_builder.get_imports_via_regex(BeautifulSoup(html, 'lxml'))]
        )
        self.module_vocab = list(set(modules_series.sum()))
        # Build co-occurrence pairs: every pair of modules imported in the same post.
        combinations = modules_series.apply(lambda row: list(itertools.combinations(row, 2)))
        combinations = combinations[combinations.astype(str) != '[]']
        # Now concatenate all the lists together.
        module_pairs = []
        for i in combinations:
            module_pairs += i
        self.training_pairs = module_pairs

    def sample_n(self, pairs, train_size: int):
        return random.sample(pairs, train_size)

    def train(self, train_size: int, epochs: int):
        # Loss: negative log-likelihood over the vocabulary (skip-gram-style objective).
        loss_function = nn.NLLLoss()
        losses = []
        # Model
        self.model = ModuleEmbedding(vocab_size=len(self.module_vocab), embedding_dim=self.emb_size).to(self.device)
        # Optimizer
        optimizer = optim.SGD(self.model.parameters(), lr=0.001)
        # Enumerate the vocabulary; the index reflects where the 1 sits in the one-hot encoding.
        self.tag_to_ix = {tag: i for i, tag in enumerate(self.module_vocab)}
        # Reduce the size of the training set.
        samples = self.sample_n(self.training_pairs, train_size)
        for epoch in range(epochs):
            total_loss = 0
            for tag_a, tag_b in tqdm(samples):
                tag_a_id = torch.tensor(self.tag_to_ix[tag_a], dtype=torch.long).to(self.device)
                self.model.zero_grad()
                log_probs = self.model(tag_a_id)
                loss = loss_function(
                    log_probs.flatten(),
                    torch.tensor(self.tag_to_ix[tag_b], dtype=torch.long).to(self.device)
                )
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            losses.append(total_loss)

    def get_tag_embedding(self, tag: str):
        return self.model.embedding.weight[self.tag_to_ix[tag]]

    def to_tensorboard(self, run_name: str):
        """
        Write the embedding to the TensorBoard projector, e.g.
        tensorboard --logdir="runs/run@20221102-173048"
        """
        writer = SummaryWriter(f'runs/{run_name}')
        writer.add_embedding(self.model.embedding.weight,
                             metadata=self.module_vocab,
                             tag='Next-Tag embedding')
        writer.close()

    def load_model(self, model_path: str, vocab_size: int, embedding_dim: int):
        self.model = ModuleEmbedding(vocab_size, embedding_dim)
        self.model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

    def save_model(self, model_path: str):
        torch.save(self.model.state_dict(), model_path)


class ModuleEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embedding(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
if __name__ == '__main__':
    met = ModuleEmbeddingTrainer(emb_size=30, database_path='../stackoverflow.db')
    met.from_db()
    print(len(met.training_pairs))
    print(len(met.module_vocab))
    # tet = NextTagEmbeddingTrainer(context_length=3, emb_size=50)
    # tet.from_files("../data/raw/all_tags.csv", "../data/raw/tag_vocab.csv")
    # assert len(tet.post_tags) == 84187510, "Incorrect number of post tags!"
    # assert len(tet.tag_vocab) == 63653, "Incorrect vocab size!"
    met.train(1000, 1)
    # tet.to_tensorboard(f"run@{time.strftime('%Y%m%d-%H%M%S')}")
    # tet.save_model("25mil.pt")
    # tet.load_model("10mil_500d_embd.pt", 63653, 500)
    # tet.to_tensorboard(f"run@{time.strftime('%Y%m%d-%H%M%S')}")
\ No newline at end of file
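To make the objective concrete, here is a minimal sketch of how one co-occurrence pair trains the skip-gram-style model above. The three-module vocabulary is hypothetical, and `ModuleEmbedding` is the class defined in this file:

``` python
import torch
from torch import nn

# Hypothetical three-module vocabulary; the list index plays the role of tag_to_ix.
vocab = ['numpy', 'pandas', 'torch']
model = ModuleEmbedding(vocab_size=3, embedding_dim=8)
loss_fn = nn.NLLLoss()

inp = torch.tensor(vocab.index('numpy'), dtype=torch.long)      # observed module
target = torch.tensor(vocab.index('pandas'), dtype=torch.long)  # module it co-occurred with
log_probs = model(inp)                       # shape [1, 3]: log-probabilities over the vocab
loss = loss_fn(log_probs.flatten(), target)  # NLL of the co-occurring module
loss.backward()                              # gradient reaches the 'numpy' embedding row
```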
@@ -16,7 +16,7 @@ logging.basicConfig(level=logging.INFO)

 class NextTagEmbeddingTrainer:

-    def __init__(self, context_length: int, emb_size: int, database_path: str = None):
+    def __init__(self, context_length: int, emb_size: int, excluded_tags=None, database_path: str = None):
         logger = logging.getLogger(self.__class__.__name__)
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logger.info(f"Proceeding with {self.device} . .")
@@ -27,14 +27,20 @@ class NextTagEmbeddingTrainer:
         self.post_tags: List[Tuple]
         self.context_length = context_length
         self.emb_size = emb_size
+        self.excluded_tags = excluded_tags

     def build_cbow(self, tags: List[str], context_len: int) -> List[Tuple]:
-        pairs = []
-        if len(tags) <= 1:
+        filtered_tags = [t for t in tags if t not in self.excluded_tags]
+        if len(filtered_tags) <= 1:
             return []
-        for target in tags:
-            context = [t for t in tags if t != target]
+        pairs = []
+        for target in filtered_tags:
+            context = [t for t in filtered_tags if t != target]
             # Pad or cut depending on the context length
             while len(context) < context_len:
                 context.append('PAD')
@@ -58,17 +64,16 @@ class NextTagEmbeddingTrainer:
         self.post_tags = tag_pairs

     def from_db(self):
-        tag_df = pd.read_sql_query("SELECT * FROM Tag", self.db)
-        tag_df.set_index('TagId', inplace=True)
-        self.tag_vocab = list(set(tag_df["TagName"]))
-        post_tags = pd.read_sql_query(f"SELECT Tags FROM Post WHERE PostTypeId=1", self.db)
+        post_tags = pd.read_sql_query(f"SELECT Tags FROM Post WHERE PostTypeId=1 AND Tags LIKE '%python%' LIMIT 100000", self.db)
         tag_list_df = post_tags['Tags'].map(self.parse_tag_list)
-        combinations = tag_list_df.apply(lambda row: list(itertools.combinations(row, 2)))
-        combinations = combinations[combinations.astype(str) != '[]']
+        self.tag_vocab = list(set(tag_list_df.sum() + ["PAD"]))
+        context_and_target = tag_list_df.apply(lambda row: self.build_cbow(row, self.context_length))
+        context_and_target = context_and_target[context_and_target.astype(str) != '[]']
         # Now concatenate all the lists together
         tag_pairs = []
-        for i in combinations:
+        for i in context_and_target:
             tag_pairs += i
         self.post_tags = tag_pairs
@@ -143,15 +148,19 @@ class NextTagEmbedding(nn.Module):

 if __name__ == '__main__':
     # tet = TagEmbeddingTrainer("../stackoverflow.db")
     # tet.from_db()
-    tet = NextTagEmbeddingTrainer(context_length=3, emb_size=30)
+    tet = NextTagEmbeddingTrainer(context_length=2, emb_size=30, excluded_tags=['python'], database_path="../stackoverflow.db")
     tet.from_db()
     print(len(tet.post_tags))
     print(len(tet.tag_vocab))
     # tet = NextTagEmbeddingTrainer(context_length=3, emb_size=50)
-    tet.from_files("../data/raw/all_tags.csv", "../data/raw/tag_vocab.csv")
+    #tet.from_files("../data/raw/all_tags.csv", "../data/raw/tag_vocab.csv")
     # assert len(tet.post_tags) == 84187510, "Incorrect number of post tags!"
     # assert len(tet.tag_vocab) == 63653, "Incorrect vocab size!"
-    tet.train(100, 1)
+    tet.train(1000, 1)
     # tet.to_tensorboard(f"run@{time.strftime('%Y%m%d-%H%M%S')}")
     # tet.save_model("25mil.pt")
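To illustrate what the reworked `build_cbow` produces, here is a standalone sketch of the same logic (not the commit's class itself; the final pair-appending and context-cutting step sits outside the visible hunk, so it is reconstructed here as an assumption):

``` python
from typing import List, Tuple

# Standalone sketch of the reworked build_cbow logic, with excluded-tag filtering.
def build_cbow(tags: List[str], context_len: int, excluded_tags: List[str]) -> List[Tuple]:
    filtered = [t for t in tags if t not in excluded_tags]
    if len(filtered) <= 1:
        return []  # a post tagged only 'python' yields no training pairs
    pairs = []
    for target in filtered:
        context = [t for t in filtered if t != target]
        while len(context) < context_len:  # pad short contexts
            context.append('PAD')
        pairs.append((context[:context_len], target))  # cut long contexts (assumed step)
    return pairs

print(build_cbow(['python', 'pandas', 'dataframe'], 2, ['python']))
# [(['dataframe', 'PAD'], 'pandas'), (['pandas', 'PAD'], 'dataframe')]
```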
import logging
import os
import pickle
import sqlite3
import warnings

import pandas as pd
import torch
from bs4 import MarkupResemblesLocatorWarning
from torch_geometric.data import Dataset, download_url, Data
from tqdm import tqdm

warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning)

from post_embedding_builder import PostEmbedding
from static_graph_construction import StaticGraphConstruction

logging.basicConfig()
#logging.getLogger().setLevel(logging.ERROR)
log = logging.getLogger("dataset")
class UserGraphDataset(Dataset):
    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None, db_address: str = None, question_count=70000):
        self._question_count = question_count
        # Connect to database.
        if db_address is not None:
            self._db = sqlite3.connect(db_address)
            self._post_embedding_builder = PostEmbedding()
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        return []

    @property
    def processed_file_names(self):
        return os.listdir("data/processed")

    def download(self):
        pass

    def process(self):
        idx = 0
        valid_questions = self.fetch_valid_questions()
        for row in tqdm(valid_questions.itertuples(), total=len(valid_questions)):
            # Build question embedding.
            question_emb = self._post_embedding_builder(
                row.question_body,
                use_bert=True,
                title=row.question_title
            )
            answers_to_question = self.fetch_answers_for_question(row.post_id)
            # Build answer embeddings.
            for _, answer_body, answer_user_id, score in answers_to_question.itertuples():
                answer_emb = self._post_embedding_builder(
                    answer_body,
                    use_bert=True
                )
                # Build the answerer's user activity graph.
                graph = self.construct_graph(answer_user_id)
                # PyTorch Geometric data object; the label is 1 for a positively scored answer, else 0.
                data = Data(
                    x=graph.x_dict,
                    edge_index=graph.edge_index_dict,
                    y=torch.tensor([1 if score > 0 else 0], dtype=torch.long),
                    question_emb=question_emb,
                    answer_emb=answer_emb
                )
                torch.save(data, os.path.join(self.processed_dir, f'data_{idx}.pt'))
                idx += 1

    def len(self):
        return len(self.processed_file_names)

    def get(self, idx):
        data = torch.load(os.path.join(self.processed_dir, f'data_{idx}.pt'))
        return data

    '''
    Database functions
    '''

    def fetch_valid_questions(self):
        valid_questions = pd.read_sql_query(f"""
                SELECT Q.PostId, Q.Body, Q.Title, Q.OwnerUserId FROM Post Q
                INNER JOIN Post A ON Q.PostId = A.ParentId
                WHERE (Q.Tags LIKE '%<python>%')
                GROUP BY A.ParentId
                HAVING SUM(A.Score) > 15
                LIMIT {self._question_count}
        """, self._db)
        valid_questions.columns = ['post_id', 'question_body', 'question_title', 'question_user_id']
        return valid_questions

    def fetch_questions_by_user(self, user_id: int):
        questions_df = pd.read_sql_query(f"""
                SELECT *
                FROM Post
                WHERE Tags LIKE '%python%' AND (PostTypeId = 1) AND ((LastEditorUserId = {user_id}) OR (OwnerUserId = {user_id}))
        """, self._db)
        questions_df.set_index('PostId', inplace=True)
        return questions_df

    def fetch_answers_by_user(self, user_id: int):
        answers_df = pd.read_sql_query(f"""
                SELECT A.Tags, B.*
                FROM Post A
                INNER JOIN Post B ON (B.ParentId = A.PostId) AND (B.ParentId IS NOT NULL)
                WHERE A.Tags LIKE '%python%' AND (B.PostTypeId = 2) AND ((B.LastEditorUserId = {user_id}) OR (B.OwnerUserId = {user_id}))
        """, self._db)
        # Drop the duplicated columns produced by the join.
        answers_df = answers_df.loc[:, ~answers_df.columns.duplicated()].copy()
        answers_df.set_index('PostId', inplace=True)
        return answers_df

    def fetch_answers_for_question(self, question_post_id: int):
        answers_df = pd.read_sql_query(f"""
                SELECT Body, OwnerUserId, Score
                FROM Post
                WHERE ParentId = {question_post_id}
        """, self._db)
        answers_df = answers_df.dropna()
        return answers_df

    def fetch_comments_by_user(self, user_id: int):
        comments_on_questions_df = pd.read_sql_query(f"""
                SELECT A.Tags, B.*
                FROM Post A
                INNER JOIN Comment B ON (B.PostId = A.PostId)
                WHERE A.Tags LIKE '%python%' AND (B.UserId = {user_id}) AND (A.PostTypeId = 1)
        """, self._db)
        comments_on_questions_df.set_index('CommentId', inplace=True)

        comments_on_answers_df = pd.read_sql_query(f"""
                SELECT A.Tags, C.*
                FROM Post A
                INNER JOIN Post B ON (B.ParentId = A.PostId) AND (B.ParentId IS NOT NULL)
                INNER JOIN Comment C ON (B.PostId = C.PostId)
                WHERE A.Tags LIKE '%python%' AND (C.UserId = {user_id}) AND (B.PostTypeId = 2)
        """, self._db)
        comments_on_answers_df.set_index('CommentId', inplace=True)

        return pd.concat([comments_on_questions_df, comments_on_answers_df])

    def construct_graph(self, user_id: int):
        graph_constructor = StaticGraphConstruction()
        qs = self.fetch_questions_by_user(user_id)
        ans = self.fetch_answers_by_user(user_id)
        cs = self.fetch_comments_by_user(user_id)
        return graph_constructor.construct(questions=qs, answers=ans, comments=cs)


if __name__ == '__main__':
    ds = UserGraphDataset('../data/', db_address='../stackoverflow.db', question_count=100)
    print(ds.get(0))
\ No newline at end of file
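As a quick sanity check after processing, a sample can be loaded and inspected directly. The field names follow the `Data` object built in `process` above; paths, counts, and the printed values are illustrative, not from a real run:

``` python
ds = UserGraphDataset('../data/', db_address='../stackoverflow.db', question_count=100)
sample = ds.get(0)
print(sample.y)             # tensor([1]) if the answer scored positively, else tensor([0])
print(sample.x.keys())      # per-node-type feature dict, e.g. 'question', 'answer', 'tag', 'module'
print(sample.question_emb)  # embeddings produced by PostEmbedding
```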
-import ast, astunparse
+import ast
 import io
 import time
 import logging
+import re
 import tokenize
 from collections import namedtuple
 from typing import List

 logging.basicConfig()
 logging.getLogger().setLevel(logging.INFO)
 log = logging.getLogger(__name__)

 from bs4 import BeautifulSoup
 import spacy
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
 from torchtext.vocab import GloVe
 from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
@@ -25,7 +29,8 @@ class PostEmbedding(nn.Module):

     def __init__(self):
         super().__init__()
-        self._global_vectors = GloVe(name='840B', dim=300)
+        log.info("PostEmbedding instantiated!")
+        #self._global_vectors = GloVe(name='840B', dim=300)
         self._en = spacy.load('en_core_web_sm')
         self._stopwords = self._en.Defaults.stop_words
         self._bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
@@ -33,7 +38,7 @@ class PostEmbedding(nn.Module):
         self._code_bert_tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
         self._code_bert_model = AutoModel.from_pretrained("microsoft/codebert-base")

-    def forward(self, html: str, title: str=None, flatten=True) -> torch.tensor:
+    def forward(self, html: str, use_bert: bool, title: str=None, flatten=True) -> torch.tensor:
         """
         @param html: HTML string of the body of a StackOverflow post.
         @param title: Title of a question post.
@@ -43,12 +48,16 @@ class PostEmbedding(nn.Module):
         soup = BeautifulSoup(html, 'lxml')
         ps = self.get_paragraphs(soup, title)
+        if use_bert:
+            para_emb = self.to_bert_embedding(" ".join(ps))
+        else:
+            para_emb = self.to_glove_paragraph_embedding(ps)
-        code = self.get_code(soup)
+        modules, funcs = self.get_code(soup, get_imports_with_regex=True)
+        code_bert = self.to_code_bert_embedding("\n".join([x.get_text() for x in soup.find_all('code')]))
-        return para_emb, code
+        return para_emb, code_bert, modules

     def preprocess(self, text: str) -> List[str]:
         """
@@ -72,7 +81,7 @@ class PostEmbedding(nn.Module):
             paras.append(self.preprocess(title))
         return [token for para in paras for token in para]

-    def get_code(self, soup: BeautifulSoup) -> (List[Import], List[Function]):
+    def get_code(self, soup: BeautifulSoup, get_imports_with_regex=False, get_functions_with_regex=False) -> (List[Import], List[Function]):
         """
         @param soup: Post body HTML wrapped in a BeautifulSoup object.
         @return: Combined string of code snippets
@@ -82,8 +91,15 @@ class PostEmbedding(nn.Module):
             syntax_tree = ast.parse(code_snippet)
         except SyntaxError:
             return ([],[])
-        modules = list(self.get_imports(syntax_tree))
-        function_defs = list(self.get_function(syntax_tree))
+        if get_imports_with_regex:
+            modules = list(self.get_imports_via_regex(soup))
+        else:
+            modules = list(self.get_imports_via_ast(syntax_tree))
+        if get_functions_with_regex:
+            raise NotImplementedError("RegEx implementation for function names not implemented yet . .")
+        else:
+            function_defs = list(self.get_function_via_ast(syntax_tree))
         return modules, function_defs

     def to_glove_paragraph_embedding(self, tokens: List[str]) -> torch.tensor:
@@ -98,10 +114,13 @@ class PostEmbedding(nn.Module):
     def to_bert_embedding(self, text: str) -> torch.tensor:
         sentences = [i.text for i in self._en(text).sents]
-        encodings = self._tokenizer(sentences, padding=True, return_tensors='pt')
+        if not len(sentences):
+            return torch.zeros(768)
+        encodings = self._bert_tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length=512)
         with torch.no_grad():
-            embeds = self._bert_model(**encodings)
-            return embeds.mean(dim=1).mean(dim=0)
+            outputs = self._bert_model(**encodings, output_hidden_states=True)
+        cls = outputs.hidden_states[-1][0,0,:]
+        return cls

     def to_code_bert_embedding(self, code):
@@ -112,30 +131,61 @@ class PostEmbedding(nn.Module):
         """
         # First, get the comments from the Python code (NL)
         buf = io.StringIO(code)
-        comments = [line.string for line in tokenize.generate_tokens(buf.readline) if line.type == tokenize.COMMENT]
-        comments = " ".join(comments)
-        print(comments)
+        source = []
+        comments = []
+        token_gen = tokenize.generate_tokens(buf.readline)
+        while True:
+            try:
+                token = next(token_gen)
+                if token.type == tokenize.COMMENT:
+                    comments.append(token.string)
+                else:
+                    source.append(token.string)
+            except tokenize.TokenError:
+                continue
+            except StopIteration:
+                break
+            except IndentationError:
+                continue

-        nl_tokens = self._code_bert_tokenizer.tokenize(comments)
+        nl_tokens = self._code_bert_tokenizer.tokenize(" ".join(comments))

-        syntax_tree = ast.parse(code)
-        uncommented = astunparse.unparse(syntax_tree)
-        code_tokens = self._code_bert_tokenizer.tokenize(uncommented)
+        code_tokens = self._code_bert_tokenizer.tokenize("".join(source))

+        # CodeBERT has a max token length of 512
+        while len(nl_tokens) + len(code_tokens) > 509:
+            if len(nl_tokens) > len(code_tokens):
+                nl_tokens = nl_tokens[:-1]
+            else:
+                code_tokens = code_tokens[:-1]

         tokens = [self._code_bert_tokenizer.cls_token] + nl_tokens + [self._code_bert_tokenizer.sep_token] + code_tokens + [self._code_bert_tokenizer.eos_token]
         tokens_ids = self._code_bert_tokenizer.convert_tokens_to_ids(tokens)
-        print(len(tokens))
-        return self._code_bert_model(torch.tensor(tokens_ids)[None,:])[0]
+        emb = self._code_bert_model(torch.tensor(tokens_ids)[None,:])[0]
+        return emb.mean(dim=1).mean(dim=0)

+    """
+    Python RegEx methods
+    """
+
+    def get_imports_via_regex(self, soup) -> Import:
+        code_snippet = "\n".join([x.get_text() for x in soup.find_all('code')])
+        PATTERN = r'^\s*(?:from|import)\s+(\w+(?:\s*,\s*\w+)*)'
+        for module in list(set(re.findall(PATTERN, code_snippet, flags=re.MULTILINE))):
+            yield Import(module, None, None)

     """
     Python Abstract Syntax Tree methods
     """

-    def get_imports(self, syntax_tree) -> Import:
+    def get_imports_via_ast(self, syntax_tree) -> Import:
         """
         @param code_snippet:
         @return:
@@ -151,7 +201,7 @@ class PostEmbedding(nn.Module):
             for n in node.names:
                 yield Import(module, n.name.split('.'), n.asname)

-    def get_function(self, syntax_tree) -> Function:
+    def get_function_via_ast(self, syntax_tree) -> Function:
         """
         @param code_snippet:
         @return:
@@ -164,4 +214,6 @@ class PostEmbedding(nn.Module):

 if __name__ == '__main__':
     pe = PostEmbedding()
     print(pe.to_code_bert_embedding("def a(self: int) -> Function: #hello\n a+2\n return a").shape)
     #print(pe.to_code_bert_embedding("\n".join(["for i in range(32):\n #return 6 or something\n"])).shape)
+    print(pe.to_bert_embedding("This is a test sentence."))
+    #print([x.module for x in pe.get_imports_via_regex(BeautifulSoup("<code>import ast</code>", 'lxml'))])
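As a quick illustration of the new regex-based import extraction, here is a standalone sketch using Python's `re` directly with the same PATTERN as above (the sample snippet is made up):

``` python
import re

PATTERN = r'^\s*(?:from|import)\s+(\w+(?:\s*,\s*\w+)*)'
snippet = "import numpy as np\nfrom collections import namedtuple\nimport os, sys"
print(set(re.findall(PATTERN, snippet, flags=re.MULTILINE)))
# {'numpy', 'collections', 'os, sys'} (set order may vary)
# Note: comma-separated imports come back as a single match.
```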
from typing import List
import logging

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
log = logging.getLogger(__name__)

import pandas as pd
import torch
from torch_geometric.data import HeteroData

from post_embedding_builder import Import, PostEmbedding


class StaticGraphConstruction:
    # PostEmbedding is costly, so build it once at class level rather than per construction.
    post_embedding_builder = PostEmbedding()

    def __init__(self):
        self._known_tags = {}  # tag_name -> index
        self._known_modules = {}  # module_name -> index
        self._data = HeteroData()
        self._first_n_tags = 3

        self._tag_to_question_edges = []
        self._tag_to_answer_edges = []
        self._tag_to_comment_edges = []

        self._module_to_question_edges = []
        self._module_to_answer_edges = []
        self._module_to_comment_edges = []

        self._use_bert = True
        self._post_count_limit = 10

    def process_questions(self, questions: pd.DataFrame):
        for i, body, title, tags in questions[['Body', 'Title', 'Tags']].itertuples():
            word_embedding, code_embedding, modules = StaticGraphConstruction.post_embedding_builder(body, self._use_bert, title)
            modules = self.process_module_names(modules)
            tag_list = self.parse_tag_list(tags)[:self._first_n_tags]

            for tag in tag_list:
                self._tag_to_question_edges.append((self._known_tags[tag], i))
            for module in modules:
                self._module_to_question_edges.append((self._known_modules[module], i))

            yield torch.cat((word_embedding, code_embedding))

    def process_answers(self, answers: pd.DataFrame):
        for i, body, title, tags in answers[['Body', 'Title', 'Tags']].itertuples():
            word_embedding, code_embedding, modules = StaticGraphConstruction.post_embedding_builder(body, self._use_bert, title)
            modules = self.process_module_names(modules)
            tag_list = self.parse_tag_list(tags)[:self._first_n_tags]

            for tag in tag_list:
                self._tag_to_answer_edges.append((self._known_tags[tag], i))
            for module in modules:
                self._module_to_answer_edges.append((self._known_modules[module], i))

            yield torch.cat((word_embedding, code_embedding))

    def process_comments(self, comments: pd.DataFrame):
        for i, body, tags in comments[['Body', 'Tags']].itertuples():
            word_embedding, code_embedding, modules = StaticGraphConstruction.post_embedding_builder(body, self._use_bert)
            modules = self.process_module_names(modules)
            tag_list = self.parse_tag_list(tags)[:self._first_n_tags]

            for tag in tag_list:
                self._tag_to_comment_edges.append((self._known_tags[tag], i))
            for module in modules:
                self._module_to_comment_edges.append((self._known_modules[module], i))

            yield torch.cat((word_embedding, code_embedding))

    def process_tags(self):
        if not len(self._known_tags):
            return None
        for tag in self._known_tags:
            yield torch.rand(90)  # TODO: Map tag name to its embedding

    def process_modules(self):
        if not len(self._known_modules):
            return None
        for module in self._known_modules:  # TODO: Map module name to its embedding
            yield torch.rand(110)

    """
    Utility functions
    """

    def parse_tag_list(self, tag_list: str) -> List[str]:
        # Tags arrive as '<tag1><tag2>...'; drop the generic Python tags.
        tags = [x for x in tag_list[1:-1].split("><") if x not in ['python', 'python-3.x']]
        for t in tags:
            if t not in self._known_tags:
                self._known_tags[t] = len(self._known_tags)
        return tags

    def process_module_names(self, import_statements: List[Import]):
        modules = [i.module[0] for i in import_statements if i.module]
        for m in modules:
            if m not in self._known_modules:
                self._known_modules[m] = len(self._known_modules)
        return modules

    def construct(self, questions, answers, comments) -> HeteroData:
        questions = questions.head(self._post_count_limit)
        answers = answers.head(self._post_count_limit)
        comments = comments.head(self._post_count_limit)

        questions.reset_index(inplace=True)
        answers.reset_index(inplace=True)
        comments.reset_index(inplace=True)

        question_nodes = list(self.process_questions(questions))
        answer_nodes = list(self.process_answers(answers))
        comment_nodes = list(self.process_comments(comments))
        tag_nodes = list(self.process_tags())
        module_nodes = list(self.process_modules())

        if len(question_nodes):
            self._data['question'].x = torch.stack(question_nodes)
        if len(answer_nodes):
            self._data['answer'].x = torch.stack(answer_nodes)
        if len(comment_nodes):
            self._data['comment'].x = torch.stack(comment_nodes)
        if len(tag_nodes):
            self._data['tag'].x = torch.stack(tag_nodes)
        if len(module_nodes):
            self._data['module'].x = torch.stack(module_nodes)

        self._data['tag', 'describes', 'question'].edge_index = torch.tensor(self._tag_to_question_edges).t().contiguous()
        self._data['tag', 'describes', 'answer'].edge_index = torch.tensor(self._tag_to_answer_edges).t().contiguous()
        self._data['tag', 'describes', 'comment'].edge_index = torch.tensor(self._tag_to_comment_edges).t().contiguous()
        self._data['module', 'imported_in', 'question'].edge_index = torch.tensor(self._module_to_question_edges).t().contiguous()
        self._data['module', 'imported_in', 'answer'].edge_index = torch.tensor(self._module_to_answer_edges).t().contiguous()
        # Comment-module edges are collected above, so register them as well.
        self._data['module', 'imported_in', 'comment'].edge_index = torch.tensor(self._module_to_comment_edges).t().contiguous()
        return self._data
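For orientation, the resulting `HeteroData` object groups node features by type and edge indices by (source, relation, destination) triples. A minimal sketch of the same structure built by hand, with toy dimensions rather than the project's real feature sizes:

``` python
import torch
from torch_geometric.data import HeteroData

data = HeteroData()
data['question'].x = torch.rand(2, 8)  # 2 question nodes, toy 8-dim features
data['tag'].x = torch.rand(3, 8)       # 3 tag nodes
# edge_index is [2, num_edges]: row 0 = source (tag) ids, row 1 = destination (question) ids.
data['tag', 'describes', 'question'].edge_index = torch.tensor([[0, 1, 2], [0, 0, 1]])
print(data.metadata())  # (node types, edge types) -- what to_hetero consumes
```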
from abc import ABC

import networkx as nx
import pandas as pd
from graph4nlp.pytorch.data import GraphData
from graph4nlp.pytorch.modules.graph_construction import DependencyBasedGraphConstruction
from graph4nlp.pytorch.modules.graph_construction.base import StaticGraphConstructionBase
from matplotlib import pyplot as plt


class StaticUserGraphConstruction:
    """Class for StackOverflow user activity graph construction"""

    def __init__(self):
        super(StaticUserGraphConstruction, self).__init__()

    @classmethod
    def static_topology(cls, questions: pd.DataFrame, answers: pd.DataFrame, comments: pd.DataFrame) -> GraphData:
        return cls._construct_static_graph(questions, answers, comments)

    @classmethod
    def _construct_static_graph(cls, questions: pd.DataFrame, answers: pd.DataFrame, comments: pd.DataFrame):
        # Work-in-progress scaffolding for the user graph.
        user_graph = GraphData()
        next_node = 0
        color_map = []
        node_features = []
        tag_dict = {}  # tag name -> node id
        module_dict = {}
        edges_src = []
        edges_dest = []

    @classmethod
    def display_graph(cls, g: GraphData, color_map=None) -> None:
        plt.figure(figsize=(40, 40))
        dgl_ug = g.to_dgl()
        nx_ug_graph = dgl_ug.to_networkx()
        pos_ug = nx.spring_layout(nx_ug_graph)  # , k=0.15, iterations=20)
        if color_map is not None:
            nx.draw(nx_ug_graph, pos_ug, with_labels=True, node_color=color_map)
        else:
            nx.draw(nx_ug_graph, pos_ug, with_labels=True, node_color=[[.7, .7, .7]])


if __name__ == '__main__':
    t = DependencyBasedGraphConstruction(None)
    t(None)
    # graph_topology = StaticUserGraphConstruction()
    # graph_topology(GraphData())
%% Cell type:code id: tags:
``` python
import sqlite3
# Create your connection.
db = sqlite3.connect('../stackoverflow.db')
```
%% Cell type:code id: tags:
``` python
QUESTIONS_RETRIEVED = 70000
```
%% Cell type:code id: tags:
``` python
import pandas as pd
valid_questions = pd.read_sql_query(f"""
SELECT * FROM Post Q
INNER JOIN Post A ON Q.PostId = A.ParentId
WHERE (Q.Tags LIKE '%<python>%')
GROUP BY A.ParentId
HAVING SUM(A.Score) > 15
LIMIT {QUESTIONS_RETRIEVED}
""", db)
#valid_questions.set_index('PostId', inplace=True)
valid_questions
```
%% Output
PostId PostTypeId AcceptedAnswerId CreationDate Score \
0 337 1 342.0 2008-08-02T03:35:55.697 82
1 469 1 3040.0 2008-08-02T15:11:16.430 47
2 502 1 7090.0 2008-08-02T17:01:58.500 58
3 535 1 541.0 2008-08-02T18:43:54.787 68
4 594 1 595.0 2008-08-03T01:15:08.507 55
... ... ... ... ... ...
69995 53827601 1 53827671.0 2018-12-18T06:39:38.237 14
69996 53829045 1 53829145.0 2018-12-18T08:32:12.813 14
69997 53829896 1 53829925.0 2018-12-18T09:26:32.470 11
69998 53830081 1 53830333.0 2018-12-18T09:37:15.270 12
69999 53832607 1 NaN 2018-12-18T12:03:57.233 11
ViewCount Body \
0 10098 <p>I am about to build a piece of a project th...
1 4372 <p>I am using the Photoshop's javascript API t...
2 17199 <p>I have a cross-platform (Python) applicatio...
3 9540 <p>I am starting to work on a hobby project wi...
4 57408 <p>There are several ways to iterate over a re...
... ... ...
69995 23334 <p>I am getting below error. Is there any way ...
69996 1351 <pre><code>variable=";CREATEDBY~string~1~~72~0...
69997 63713 <p>I'm trying with the code from link below to...
69998 97349 <p>I am currently trying to compare values fro...
69999 30479 <p>I have a list:</p>\n<pre><code>ueid_list = ...
OwnerUserId LastEditorUserId LastEditorDisplayName ... \
0 111.0 2336654.0 None ...
1 147.0 1997093.0 Ash ...
2 147.0 9780149.0 Adam Mitz ...
3 154.0 7232508.0 Robert Gamble ...
4 116.0 116.0 Mark Harrison ...
... ... ... ... ...
69995 8930395.0 8930395.0 None ...
69996 2996372.0 63550.0 None ...
69997 8410477.0 NaN None ...
69998 10805303.0 9698684.0 None ...
69999 10779037.0 63550.0 None ...
LastEditorDisplayName LastEditDate Title Tags AnswerCount \
0 None None None None None
1 None None None None None
2 Jeff Atwood 2015-08-29T23:20:48.107 None None None
3 user1873471 2013-01-28T03:54:17.217 None None None
4 None 2016-10-15T20:47:11.027 None None None
... ... ... ... ... ...
69995 None None None None None
69996 None 2018-12-18T09:33:34.967 None None None
69997 None 2018-12-18T09:31:48.723 None None None
69998 None 2020-05-20T13:51:35.490 None None None
69999 None 2021-07-20T11:07:51.030 None None None
CommentCount FavoriteCount CommunityOwnedDate ContentLicense ParentId
0 1 None None CC BY-SA 2.5 337
1 0 None None CC BY-SA 2.5 469
2 0 None None CC BY-SA 3.0 502
3 4 None None CC BY-SA 3.0 535
4 2 None None CC BY-SA 3.0 594
... ... ... ... ... ...
69995 1 None None CC BY-SA 4.0 53827601
69996 6 None None CC BY-SA 4.0 53829045
69997 0 None None CC BY-SA 4.0 53829896
69998 0 None None CC BY-SA 4.0 53830081
69999 0 None None CC BY-SA 4.0 53832607
[70000 rows x 38 columns]
%% Cell type:code id: tags:
``` python
```