Commit 2e26cfaf authored by Liam Byrne

Dataset implemented

parent 8afc2c7a
import torch
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GATConv, Linear, MeanAggregation, to_hetero

from dataset import UserGraphDataset


class GAT(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(12345)
        # Lazy (-1, -1) input sizes let to_hetero infer per-node-type dimensions.
        self.conv1 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
        self.conv2 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
        self.conv3 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
        # Lazy in_channels: the input width grows when post_emb is concatenated.
        self.lin = Linear(-1, 3)
        self.pool = MeanAggregation()

    def forward(self, x, edge_index, batch, post_emb):
        # 1. Obtain node embeddings.
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)
        # 2. Readout layer.
        x = self.pool(x, batch)  # [batch_size, hidden_channels]
        x = torch.cat([x, post_emb], dim=1)  # Append the post embedding to the graph readout.
        # 3. Apply a final classifier.
        # x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        return x


def train(model, train_loader):
    model.train()
    for data in train_loader:  # Iterate in batches over the training dataset.
        optimizer.zero_grad()  # Clear gradients.
        # Assumes question_emb/answer_emb are per-graph tensors stored by UserGraphDataset.
        post_emb = torch.cat([data.question_emb, data.answer_emb], dim=1)
        out = model(data.x, data.edge_index, data.batch, post_emb)  # Perform a single forward pass.
        loss = criterion(out, data.y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.


def test(loader):
    model.eval()
    correct = 0
    for data in loader:  # Iterate in batches over the training/test dataset.
        post_emb = torch.cat([data.question_emb, data.answer_emb], dim=1)
        out = model(data.x, data.edge_index, data.batch, post_emb)
        pred = out.argmax(dim=1)  # Use the class with highest probability.
        correct += int((pred == data.y).sum())  # Check against ground-truth labels.
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.


if __name__ == '__main__':
    dataset = UserGraphDataset(root="data")
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [0.6, 0.1, 0.3])

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)
    test_loader = DataLoader(test_dataset, batch_size=64)

    model = GAT(hidden_channels=64)
    example_graph = dataset[0]  # DataLoader is not indexable; take a sample from the dataset.
    print(example_graph)
    model = to_hetero(model, example_graph.metadata())

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(1, 10):
        train(model, train_loader)
        train_acc = test(train_loader)
        test_acc = test(test_loader)
        print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')
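A note on the `to_hetero` call: it rewrites the homogeneous `forward` so that node features and edge indices arrive as dictionaries keyed by node and edge type, which is why the dataset stores `x_dict`/`edge_index_dict`. A minimal sketch of that calling convention, with toy shapes and node types rather than the project's real ones:

``` python
import torch

# Toy heterogeneous inputs of the kind a to_hetero-converted model consumes.
x_dict = {
    'question': torch.rand(4, 16),  # 4 question nodes, toy 16-dim features
    'tag': torch.rand(3, 16),       # 3 tag nodes
}
edge_index_dict = {
    # Row 0: source (tag) indices; row 1: destination (question) indices.
    ('tag', 'describes', 'question'): torch.tensor([[0, 1, 2], [0, 1, 3]]),
}
# out_dict = hetero_model(x_dict, edge_index_dict, ...)  # one output tensor per node type
```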
import itertools
import logging
import random
import sqlite3
from typing import List, Tuple

import pandas as pd
import torch
from bs4 import BeautifulSoup
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from post_embedding_builder import PostEmbedding

logging.basicConfig(level=logging.INFO)
class ModuleEmbeddingTrainer:
    def __init__(self, emb_size: int, database_path: str = None):
        logger = logging.getLogger(self.__class__.__name__)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Proceeding with {self.device} . .")
        if database_path is not None:
            self.db = sqlite3.connect(database_path)
            logger.info(f"Connected to {database_path}")
        self.emb_size = emb_size
        self.emb_builder = PostEmbedding()

    def from_files(self, post_tags_path: str, tag_vocab: str):
        pass

    def from_db(self):
        post_body_series = pd.read_sql_query(
            "SELECT Body FROM Post WHERE (Tags LIKE '%python%') AND (Body LIKE '%import%') LIMIT 100000",
            self.db
        )
        # Extract imported module names from each post body.
        modules_series = post_body_series['Body'].apply(
            lambda html: [x.module for x in self.emb_builder.get_imports_via_regex(BeautifulSoup(html, 'lxml'))]
        )
        self.module_vocab = list(set(modules_series.sum()))
        # Build co-occurrence pairs: every pair of modules imported in the same post.
        combinations = modules_series.apply(lambda row: list(itertools.combinations(row, 2)))
        combinations = combinations[combinations.astype(str) != '[]']
        # Now concatenate all the lists together.
        module_pairs = []
        for i in combinations:
            module_pairs += i
        self.training_pairs = module_pairs

    def sample_n(self, pairs, train_size: int):
        return random.sample(pairs, train_size)

    def train(self, train_size: int, epochs: int):
        # Loss: negative log-likelihood over the vocabulary (skip-gram-style objective).
        loss_function = nn.NLLLoss()
        losses = []
        # Model
        self.model = ModuleEmbedding(vocab_size=len(self.module_vocab), embedding_dim=self.emb_size).to(self.device)
        # Optimizer
        optimizer = optim.SGD(self.model.parameters(), lr=0.001)
        # Enumerate the vocabulary; the index reflects where the 1 sits in the one-hot encoding.
        self.tag_to_ix = {tag: i for i, tag in enumerate(self.module_vocab)}
        # Reduce the size of the training set.
        samples = self.sample_n(self.training_pairs, train_size)
        for epoch in range(epochs):
            total_loss = 0
            for tag_a, tag_b in tqdm(samples):
                tag_a_id = torch.tensor(self.tag_to_ix[tag_a], dtype=torch.long).to(self.device)
                self.model.zero_grad()
                log_probs = self.model(tag_a_id)
                loss = loss_function(
                    log_probs.flatten(),
                    torch.tensor(self.tag_to_ix[tag_b], dtype=torch.long).to(self.device)
                )
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            losses.append(total_loss)

    def get_tag_embedding(self, tag: str):
        return self.model.embedding.weight[self.tag_to_ix[tag]]

    def to_tensorboard(self, run_name: str):
        """
        Write the embedding to the TensorBoard projector, e.g.
        tensorboard --logdir="runs/run@20221102-173048"
        """
        writer = SummaryWriter(f'runs/{run_name}')
        writer.add_embedding(self.model.embedding.weight,
                             metadata=self.module_vocab,
                             tag='Next-Tag embedding')
        writer.close()

    def load_model(self, model_path: str, vocab_size: int, embedding_dim: int):
        self.model = ModuleEmbedding(vocab_size, embedding_dim)
        self.model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

    def save_model(self, model_path: str):
        torch.save(self.model.state_dict(), model_path)


class ModuleEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embedding(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
if __name__ == '__main__':
    met = ModuleEmbeddingTrainer(emb_size=30, database_path='../stackoverflow.db')
    met.from_db()
    print(len(met.training_pairs))
    print(len(met.module_vocab))
    # tet = NextTagEmbeddingTrainer(context_length=3, emb_size=50)
    # tet.from_files("../data/raw/all_tags.csv", "../data/raw/tag_vocab.csv")
    # assert len(tet.post_tags) == 84187510, "Incorrect number of post tags!"
    # assert len(tet.tag_vocab) == 63653, "Incorrect vocab size!"
    met.train(1000, 1)
    # tet.to_tensorboard(f"run@{time.strftime('%Y%m%d-%H%M%S')}")
    # tet.save_model("25mil.pt")
    # tet.load_model("10mil_500d_embd.pt", 63653, 500)
    # tet.to_tensorboard(f"run@{time.strftime('%Y%m%d-%H%M%S')}")
\ No newline at end of file
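To make the objective concrete, here is a minimal sketch of how one co-occurrence pair trains the skip-gram-style model above. The three-module vocabulary is hypothetical, and `ModuleEmbedding` is the class defined in this file:

``` python
import torch
from torch import nn

# Hypothetical three-module vocabulary; the list index plays the role of tag_to_ix.
vocab = ['numpy', 'pandas', 'torch']
model = ModuleEmbedding(vocab_size=3, embedding_dim=8)
loss_fn = nn.NLLLoss()

inp = torch.tensor(vocab.index('numpy'), dtype=torch.long)      # observed module
target = torch.tensor(vocab.index('pandas'), dtype=torch.long)  # module it co-occurred with
log_probs = model(inp)                       # shape [1, 3]: log-probabilities over the vocab
loss = loss_fn(log_probs.flatten(), target)  # NLL of the co-occurring module
loss.backward()                              # gradient reaches the 'numpy' embedding row
```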
@@ -16,7 +16,7 @@ logging.basicConfig(level=logging.INFO)

 class NextTagEmbeddingTrainer:

-    def __init__(self, context_length: int, emb_size: int, database_path: str = None):
+    def __init__(self, context_length: int, emb_size: int, excluded_tags=None, database_path: str = None):
         logger = logging.getLogger(self.__class__.__name__)
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logger.info(f"Proceeding with {self.device} . .")
@@ -27,14 +27,20 @@ class NextTagEmbeddingTrainer:
         self.post_tags: List[Tuple]
         self.context_length = context_length
         self.emb_size = emb_size
+        self.excluded_tags = excluded_tags

     def build_cbow(self, tags: List[str], context_len: int) -> List[Tuple]:
-        pairs = []
-        if len(tags) <= 1:
+        filtered_tags = [t for t in tags if t not in self.excluded_tags]
+        if len(filtered_tags) <= 1:
             return []
-        for target in tags:
-            context = [t for t in tags if t != target]
+        pairs = []
+        for target in filtered_tags:
+            context = [t for t in filtered_tags if t != target]
             # Pad or cut depending on the context length
             while len(context) < context_len:
                 context.append('PAD')
@@ -58,17 +64,16 @@ class NextTagEmbeddingTrainer:
         self.post_tags = tag_pairs

     def from_db(self):
-        tag_df = pd.read_sql_query("SELECT * FROM Tag", self.db)
-        tag_df.set_index('TagId', inplace=True)
-        self.tag_vocab = list(set(tag_df["TagName"]))
-        post_tags = pd.read_sql_query(f"SELECT Tags FROM Post WHERE PostTypeId=1", self.db)
+        post_tags = pd.read_sql_query(f"SELECT Tags FROM Post WHERE PostTypeId=1 AND Tags LIKE '%python%' LIMIT 100000", self.db)
         tag_list_df = post_tags['Tags'].map(self.parse_tag_list)
-        combinations = tag_list_df.apply(lambda row: list(itertools.combinations(row, 2)))
-        combinations = combinations[combinations.astype(str) != '[]']
+        self.tag_vocab = list(set(tag_list_df.sum() + ["PAD"]))
+        context_and_target = tag_list_df.apply(lambda row: self.build_cbow(row, self.context_length))
+        context_and_target = context_and_target[context_and_target.astype(str) != '[]']
         # Now concatenate all the lists together
         tag_pairs = []
-        for i in combinations:
+        for i in context_and_target:
             tag_pairs += i
         self.post_tags = tag_pairs
@@ -143,15 +148,19 @@ class NextTagEmbedding(nn.Module):

 if __name__ == '__main__':
     # tet = TagEmbeddingTrainer("../stackoverflow.db")
     # tet.from_db()
-    tet = NextTagEmbeddingTrainer(context_length=3, emb_size=30)
+    tet = NextTagEmbeddingTrainer(context_length=2, emb_size=30, excluded_tags=['python'], database_path="../stackoverflow.db")
     tet.from_db()
     print(len(tet.post_tags))
     print(len(tet.tag_vocab))
     # tet = NextTagEmbeddingTrainer(context_length=3, emb_size=50)
-    tet.from_files("../data/raw/all_tags.csv", "../data/raw/tag_vocab.csv")
+    #tet.from_files("../data/raw/all_tags.csv", "../data/raw/tag_vocab.csv")
     # assert len(tet.post_tags) == 84187510, "Incorrect number of post tags!"
     # assert len(tet.tag_vocab) == 63653, "Incorrect vocab size!"
-    tet.train(100, 1)
+    tet.train(1000, 1)
     # tet.to_tensorboard(f"run@{time.strftime('%Y%m%d-%H%M%S')}")
     # tet.save_model("25mil.pt")
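To illustrate what the reworked `build_cbow` produces, here is a standalone sketch of the same logic (not the commit's class itself; the final pair-appending and context-cutting step sits outside the visible hunk, so it is reconstructed here as an assumption):

``` python
from typing import List, Tuple

# Standalone sketch of the reworked build_cbow logic, with excluded-tag filtering.
def build_cbow(tags: List[str], context_len: int, excluded_tags: List[str]) -> List[Tuple]:
    filtered = [t for t in tags if t not in excluded_tags]
    if len(filtered) <= 1:
        return []  # a post tagged only 'python' yields no training pairs
    pairs = []
    for target in filtered:
        context = [t for t in filtered if t != target]
        while len(context) < context_len:  # pad short contexts
            context.append('PAD')
        pairs.append((context[:context_len], target))  # cut long contexts (assumed step)
    return pairs

print(build_cbow(['python', 'pandas', 'dataframe'], 2, ['python']))
# [(['dataframe', 'PAD'], 'pandas'), (['pandas', 'PAD'], 'dataframe')]
```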
import logging
import os
import pickle
import sqlite3
import warnings

import pandas as pd
import torch
from bs4 import MarkupResemblesLocatorWarning
from torch_geometric.data import Dataset, download_url, Data
from tqdm import tqdm

warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning)

from post_embedding_builder import PostEmbedding
from static_graph_construction import StaticGraphConstruction

logging.basicConfig()
#logging.getLogger().setLevel(logging.ERROR)
log = logging.getLogger("dataset")
class UserGraphDataset(Dataset):
    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None, db_address: str = None, question_count=70000):
        self._question_count = question_count
        # Connect to database.
        if db_address is not None:
            self._db = sqlite3.connect(db_address)
            self._post_embedding_builder = PostEmbedding()
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        return []

    @property
    def processed_file_names(self):
        return os.listdir("data/processed")

    def download(self):
        pass

    def process(self):
        idx = 0
        valid_questions = self.fetch_valid_questions()
        for row in tqdm(valid_questions.itertuples(), total=len(valid_questions)):
            # Build question embedding.
            question_emb = self._post_embedding_builder(
                row.question_body,
                use_bert=True,
                title=row.question_title
            )
            answers_to_question = self.fetch_answers_for_question(row.post_id)
            # Build answer embeddings.
            for _, answer_body, answer_user_id, score in answers_to_question.itertuples():
                answer_emb = self._post_embedding_builder(
                    answer_body,
                    use_bert=True
                )
                # Build the answerer's user activity graph.
                graph = self.construct_graph(answer_user_id)
                # PyTorch Geometric data object; the label is 1 for a positively scored answer, else 0.
                data = Data(
                    x=graph.x_dict,
                    edge_index=graph.edge_index_dict,
                    y=torch.tensor([1 if score > 0 else 0], dtype=torch.long),
                    question_emb=question_emb,
                    answer_emb=answer_emb
                )
                torch.save(data, os.path.join(self.processed_dir, f'data_{idx}.pt'))
                idx += 1

    def len(self):
        return len(self.processed_file_names)

    def get(self, idx):
        data = torch.load(os.path.join(self.processed_dir, f'data_{idx}.pt'))
        return data

    '''
    Database functions
    '''

    def fetch_valid_questions(self):
        valid_questions = pd.read_sql_query(f"""
                SELECT Q.PostId, Q.Body, Q.Title, Q.OwnerUserId FROM Post Q
                INNER JOIN Post A ON Q.PostId = A.ParentId
                WHERE (Q.Tags LIKE '%<python>%')
                GROUP BY A.ParentId
                HAVING SUM(A.Score) > 15
                LIMIT {self._question_count}
        """, self._db)
        valid_questions.columns = ['post_id', 'question_body', 'question_title', 'question_user_id']
        return valid_questions

    def fetch_questions_by_user(self, user_id: int):
        questions_df = pd.read_sql_query(f"""
                SELECT *
                FROM Post
                WHERE Tags LIKE '%python%' AND (PostTypeId = 1) AND ((LastEditorUserId = {user_id}) OR (OwnerUserId = {user_id}))
        """, self._db)
        questions_df.set_index('PostId', inplace=True)
        return questions_df

    def fetch_answers_by_user(self, user_id: int):
        answers_df = pd.read_sql_query(f"""
                SELECT A.Tags, B.*
                FROM Post A
                INNER JOIN Post B ON (B.ParentId = A.PostId) AND (B.ParentId IS NOT NULL)
                WHERE A.Tags LIKE '%python%' AND (B.PostTypeId = 2) AND ((B.LastEditorUserId = {user_id}) OR (B.OwnerUserId = {user_id}))
        """, self._db)
        # Drop the duplicated columns produced by the join.
        answers_df = answers_df.loc[:, ~answers_df.columns.duplicated()].copy()
        answers_df.set_index('PostId', inplace=True)
        return answers_df

    def fetch_answers_for_question(self, question_post_id: int):
        answers_df = pd.read_sql_query(f"""
                SELECT Body, OwnerUserId, Score
                FROM Post
                WHERE ParentId = {question_post_id}
        """, self._db)
        answers_df = answers_df.dropna()
        return answers_df

    def fetch_comments_by_user(self, user_id: int):
        comments_on_questions_df = pd.read_sql_query(f"""
                SELECT A.Tags, B.*
                FROM Post A
                INNER JOIN Comment B ON (B.PostId = A.PostId)
                WHERE A.Tags LIKE '%python%' AND (B.UserId = {user_id}) AND (A.PostTypeId = 1)
        """, self._db)
        comments_on_questions_df.set_index('CommentId', inplace=True)

        comments_on_answers_df = pd.read_sql_query(f"""
                SELECT A.Tags, C.*
                FROM Post A
                INNER JOIN Post B ON (B.ParentId = A.PostId) AND (B.ParentId IS NOT NULL)
                INNER JOIN Comment C ON (B.PostId = C.PostId)
                WHERE A.Tags LIKE '%python%' AND (C.UserId = {user_id}) AND (B.PostTypeId = 2)
        """, self._db)
        comments_on_answers_df.set_index('CommentId', inplace=True)

        return pd.concat([comments_on_questions_df, comments_on_answers_df])

    def construct_graph(self, user_id: int):
        graph_constructor = StaticGraphConstruction()
        qs = self.fetch_questions_by_user(user_id)
        ans = self.fetch_answers_by_user(user_id)
        cs = self.fetch_comments_by_user(user_id)
        return graph_constructor.construct(questions=qs, answers=ans, comments=cs)


if __name__ == '__main__':
    ds = UserGraphDataset('../data/', db_address='../stackoverflow.db', question_count=100)
    print(ds.get(0))
\ No newline at end of file
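As a quick sanity check after processing, a sample can be loaded and inspected directly. The field names follow the `Data` object built in `process` above; paths, counts, and the printed values are illustrative, not from a real run:

``` python
ds = UserGraphDataset('../data/', db_address='../stackoverflow.db', question_count=100)
sample = ds.get(0)
print(sample.y)             # tensor([1]) if the answer scored positively, else tensor([0])
print(sample.x.keys())      # per-node-type feature dict, e.g. 'question', 'answer', 'tag', 'module'
print(sample.question_emb)  # embeddings produced by PostEmbedding
```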
-import ast, astunparse
+import ast
 import io
 import time
 import logging
+import re
 import tokenize
 from collections import namedtuple
 from typing import List

 logging.basicConfig()
 logging.getLogger().setLevel(logging.INFO)
 log = logging.getLogger(__name__)

 from bs4 import BeautifulSoup
 import spacy
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
 from torchtext.vocab import GloVe
 from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
@@ -25,7 +29,8 @@ class PostEmbedding(nn.Module):

     def __init__(self):
         super().__init__()
-        self._global_vectors = GloVe(name='840B', dim=300)
+        log.info("PostEmbedding instantiated!")
+        #self._global_vectors = GloVe(name='840B', dim=300)
         self._en = spacy.load('en_core_web_sm')
         self._stopwords = self._en.Defaults.stop_words
         self._bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
@@ -33,7 +38,7 @@ class PostEmbedding(nn.Module):
         self._code_bert_tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
         self._code_bert_model = AutoModel.from_pretrained("microsoft/codebert-base")

-    def forward(self, html: str, title: str=None, flatten=True) -> torch.tensor:
+    def forward(self, html: str, use_bert: bool, title: str=None, flatten=True) -> torch.tensor:
         """
         @param html: HTML string of the body of a StackOverflow post.
         @param title: Title of a question post.
@@ -43,12 +48,16 @@ class PostEmbedding(nn.Module):
         soup = BeautifulSoup(html, 'lxml')
         ps = self.get_paragraphs(soup, title)
+        if use_bert:
+            para_emb = self.to_bert_embedding(" ".join(ps))
+        else:
+            para_emb = self.to_glove_paragraph_embedding(ps)
-        code = self.get_code(soup)
+        modules, funcs = self.get_code(soup, get_imports_with_regex=True)
+        code_bert = self.to_code_bert_embedding("\n".join([x.get_text() for x in soup.find_all('code')]))
-        return para_emb, code
+        return para_emb, code_bert, modules

     def preprocess(self, text: str) -> List[str]:
         """
@@ -72,7 +81,7 @@ class PostEmbedding(nn.Module):
             paras.append(self.preprocess(title))
         return [token for para in paras for token in para]

-    def get_code(self, soup: BeautifulSoup) -> (List[Import], List[Function]):
+    def get_code(self, soup: BeautifulSoup, get_imports_with_regex=False, get_functions_with_regex=False) -> (List[Import], List[Function]):
         """
         @param soup: Post body HTML wrapped in a BeautifulSoup object.
         @return: Combined string of code snippets
@@ -82,8 +91,15 @@ class PostEmbedding(nn.Module):
             syntax_tree = ast.parse(code_snippet)
         except SyntaxError:
             return ([],[])
-        modules = list(self.get_imports(syntax_tree))
-        function_defs = list(self.get_function(syntax_tree))
+        if get_imports_with_regex:
+            modules = list(self.get_imports_via_regex(soup))
+        else:
+            modules = list(self.get_imports_via_ast(syntax_tree))
+        if get_functions_with_regex:
+            raise NotImplementedError("RegEx implementation for function names not implemented yet . .")
+        else:
+            function_defs = list(self.get_function_via_ast(syntax_tree))
         return modules, function_defs

     def to_glove_paragraph_embedding(self, tokens: List[str]) -> torch.tensor:
@@ -98,10 +114,13 @@ class PostEmbedding(nn.Module):
     def to_bert_embedding(self, text: str) -> torch.tensor:
         sentences = [i.text for i in self._en(text).sents]
-        encodings = self._tokenizer(sentences, padding=True, return_tensors='pt')
+        if not len(sentences):
+            return torch.zeros(768)
+        encodings = self._bert_tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length=512)
         with torch.no_grad():
-            embeds = self._bert_model(**encodings)
-            return embeds.mean(dim=1).mean(dim=0)
+            outputs = self._bert_model(**encodings, output_hidden_states=True)
+        cls = outputs.hidden_states[-1][0,0,:]
+        return cls

     def to_code_bert_embedding(self, code):
@@ -112,30 +131,61 @@ class PostEmbedding(nn.Module):
         """
         # First, get the comments from the Python code (NL)
         buf = io.StringIO(code)
-        comments = [line.string for line in tokenize.generate_tokens(buf.readline) if line.type == tokenize.COMMENT]
-        comments = " ".join(comments)
-        print(comments)
+        source = []
+        comments = []
+        token_gen = tokenize.generate_tokens(buf.readline)
+        while True:
+            try:
+                token = next(token_gen)
+                if token.type == tokenize.COMMENT:
+                    comments.append(token.string)
+                else:
+                    source.append(token.string)
+            except tokenize.TokenError:
+                continue
+            except StopIteration:
+                break
+            except IndentationError:
+                continue

-        nl_tokens = self._code_bert_tokenizer.tokenize(comments)
+        nl_tokens = self._code_bert_tokenizer.tokenize(" ".join(comments))

-        syntax_tree = ast.parse(code)
-        uncommented = astunparse.unparse(syntax_tree)
-        code_tokens = self._code_bert_tokenizer.tokenize(uncommented)
+        code_tokens = self._code_bert_tokenizer.tokenize("".join(source))

+        # CodeBERT has a max token length of 512
+        while len(nl_tokens) + len(code_tokens) > 509:
+            if len(nl_tokens) > len(code_tokens):
+                nl_tokens = nl_tokens[:-1]
+            else:
+                code_tokens = code_tokens[:-1]

         tokens = [self._code_bert_tokenizer.cls_token] + nl_tokens + [self._code_bert_tokenizer.sep_token] + code_tokens + [self._code_bert_tokenizer.eos_token]
         tokens_ids = self._code_bert_tokenizer.convert_tokens_to_ids(tokens)
-        print(len(tokens))
-        return self._code_bert_model(torch.tensor(tokens_ids)[None,:])[0]
+        emb = self._code_bert_model(torch.tensor(tokens_ids)[None,:])[0]
+        return emb.mean(dim=1).mean(dim=0)

+    """
+    Python RegEx methods
+    """
+
+    def get_imports_via_regex(self, soup) -> Import:
+        code_snippet = "\n".join([x.get_text() for x in soup.find_all('code')])
+        PATTERN = r'^\s*(?:from|import)\s+(\w+(?:\s*,\s*\w+)*)'
+        for module in list(set(re.findall(PATTERN, code_snippet, flags=re.MULTILINE))):
+            yield Import(module, None, None)

     """
     Python Abstract Syntax Tree methods
     """

-    def get_imports(self, syntax_tree) -> Import:
+    def get_imports_via_ast(self, syntax_tree) -> Import:
         """
         @param code_snippet:
         @return:
@@ -151,7 +201,7 @@ class PostEmbedding(nn.Module):
             for n in node.names:
                 yield Import(module, n.name.split('.'), n.asname)

-    def get_function(self, syntax_tree) -> Function:
+    def get_function_via_ast(self, syntax_tree) -> Function:
         """
         @param code_snippet:
         @return:
@@ -164,4 +214,6 @@ class PostEmbedding(nn.Module):

 if __name__ == '__main__':
     pe = PostEmbedding()
     print(pe.to_code_bert_embedding("def a(self: int) -> Function: #hello\n a+2\n return a").shape)
     #print(pe.to_code_bert_embedding("\n".join(["for i in range(32):\n #return 6 or something\n"])).shape)
+    print(pe.to_bert_embedding("This is a test sentence."))
+    #print([x.module for x in pe.get_imports_via_regex(BeautifulSoup("<code>import ast</code>", 'lxml'))])
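As a quick illustration of the new regex-based import extraction, here is a standalone sketch using Python's `re` directly with the same PATTERN as above (the sample snippet is made up):

``` python
import re

PATTERN = r'^\s*(?:from|import)\s+(\w+(?:\s*,\s*\w+)*)'
snippet = "import numpy as np\nfrom collections import namedtuple\nimport os, sys"
print(set(re.findall(PATTERN, snippet, flags=re.MULTILINE)))
# {'numpy', 'collections', 'os, sys'} (set order may vary)
# Note: comma-separated imports come back as a single match.
```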
from typing import List
import logging

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
log = logging.getLogger(__name__)

import pandas as pd
import torch
from torch_geometric.data import HeteroData

from post_embedding_builder import Import, PostEmbedding


class StaticGraphConstruction:
    # PostEmbedding is costly, so build it once at class level rather than per construction.
    post_embedding_builder = PostEmbedding()

    def __init__(self):
        self._known_tags = {}  # tag_name -> index
        self._known_modules = {}  # module_name -> index
        self._data = HeteroData()
        self._first_n_tags = 3

        self._tag_to_question_edges = []
        self._tag_to_answer_edges = []
        self._tag_to_comment_edges = []

        self._module_to_question_edges = []
        self._module_to_answer_edges = []
        self._module_to_comment_edges = []

        self._use_bert = True
        self._post_count_limit = 10

    def process_questions(self, questions: pd.DataFrame):
        for i, body, title, tags in questions[['Body', 'Title', 'Tags']].itertuples():
            word_embedding, code_embedding, modules = StaticGraphConstruction.post_embedding_builder(body, self._use_bert, title)
            modules = self.process_module_names(modules)
            tag_list = self.parse_tag_list(tags)[:self._first_n_tags]

            for tag in tag_list:
                self._tag_to_question_edges.append((self._known_tags[tag], i))
            for module in modules:
                self._module_to_question_edges.append((self._known_modules[module], i))

            yield torch.cat((word_embedding, code_embedding))

    def process_answers(self, answers: pd.DataFrame):
        for i, body, title, tags in answers[['Body', 'Title', 'Tags']].itertuples():
            word_embedding, code_embedding, modules = StaticGraphConstruction.post_embedding_builder(body, self._use_bert, title)
            modules = self.process_module_names(modules)
            tag_list = self.parse_tag_list(tags)[:self._first_n_tags]

            for tag in tag_list:
                self._tag_to_answer_edges.append((self._known_tags[tag], i))
            for module in modules:
                self._module_to_answer_edges.append((self._known_modules[module], i))

            yield torch.cat((word_embedding, code_embedding))

    def process_comments(self, comments: pd.DataFrame):
        for i, body, tags in comments[['Body', 'Tags']].itertuples():
            word_embedding, code_embedding, modules = StaticGraphConstruction.post_embedding_builder(body, self._use_bert)
            modules = self.process_module_names(modules)
            tag_list = self.parse_tag_list(tags)[:self._first_n_tags]

            for tag in tag_list:
                self._tag_to_comment_edges.append((self._known_tags[tag], i))
            for module in modules:
                self._module_to_comment_edges.append((self._known_modules[module], i))

            yield torch.cat((word_embedding, code_embedding))

    def process_tags(self):
        if not len(self._known_tags):
            return None
        for tag in self._known_tags:
            yield torch.rand(90)  # TODO: Map tag name to its embedding

    def process_modules(self):
        if not len(self._known_modules):
            return None
        for module in self._known_modules:  # TODO: Map module name to its embedding
            yield torch.rand(110)

    """
    Utility functions
    """

    def parse_tag_list(self, tag_list: str) -> List[str]:
        # Tags arrive as '<tag1><tag2>...'; drop the generic Python tags.
        tags = [x for x in tag_list[1:-1].split("><") if x not in ['python', 'python-3.x']]
        for t in tags:
            if t not in self._known_tags:
                self._known_tags[t] = len(self._known_tags)
        return tags

    def process_module_names(self, import_statements: List[Import]):
        modules = [i.module[0] for i in import_statements if i.module]
        for m in modules:
            if m not in self._known_modules:
                self._known_modules[m] = len(self._known_modules)
        return modules

    def construct(self, questions, answers, comments) -> HeteroData:
        questions = questions.head(self._post_count_limit)
        answers = answers.head(self._post_count_limit)
        comments = comments.head(self._post_count_limit)

        questions.reset_index(inplace=True)
        answers.reset_index(inplace=True)
        comments.reset_index(inplace=True)

        question_nodes = list(self.process_questions(questions))
        answer_nodes = list(self.process_answers(answers))
        comment_nodes = list(self.process_comments(comments))
        tag_nodes = list(self.process_tags())
        module_nodes = list(self.process_modules())

        if len(question_nodes):
            self._data['question'].x = torch.stack(question_nodes)
        if len(answer_nodes):
            self._data['answer'].x = torch.stack(answer_nodes)
        if len(comment_nodes):
            self._data['comment'].x = torch.stack(comment_nodes)
        if len(tag_nodes):
            self._data['tag'].x = torch.stack(tag_nodes)
        if len(module_nodes):
            self._data['module'].x = torch.stack(module_nodes)

        self._data['tag', 'describes', 'question'].edge_index = torch.tensor(self._tag_to_question_edges).t().contiguous()
        self._data['tag', 'describes', 'answer'].edge_index = torch.tensor(self._tag_to_answer_edges).t().contiguous()
        self._data['tag', 'describes', 'comment'].edge_index = torch.tensor(self._tag_to_comment_edges).t().contiguous()
        self._data['module', 'imported_in', 'question'].edge_index = torch.tensor(self._module_to_question_edges).t().contiguous()
        self._data['module', 'imported_in', 'answer'].edge_index = torch.tensor(self._module_to_answer_edges).t().contiguous()
        # Comment-module edges are collected above, so register them as well.
        self._data['module', 'imported_in', 'comment'].edge_index = torch.tensor(self._module_to_comment_edges).t().contiguous()
        return self._data
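For orientation, the resulting `HeteroData` object groups node features by type and edge indices by (source, relation, destination) triples. A minimal sketch of the same structure built by hand, with toy dimensions rather than the project's real feature sizes:

``` python
import torch
from torch_geometric.data import HeteroData

data = HeteroData()
data['question'].x = torch.rand(2, 8)  # 2 question nodes, toy 8-dim features
data['tag'].x = torch.rand(3, 8)       # 3 tag nodes
# edge_index is [2, num_edges]: row 0 = source (tag) ids, row 1 = destination (question) ids.
data['tag', 'describes', 'question'].edge_index = torch.tensor([[0, 1, 2], [0, 0, 1]])
print(data.metadata())  # (node types, edge types) -- what to_hetero consumes
```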
from abc import ABC

import networkx as nx
import pandas as pd
from graph4nlp.pytorch.data import GraphData
from graph4nlp.pytorch.modules.graph_construction import DependencyBasedGraphConstruction
from graph4nlp.pytorch.modules.graph_construction.base import StaticGraphConstructionBase
from matplotlib import pyplot as plt


class StaticUserGraphConstruction:
    """Class for StackOverflow user activity graph construction"""

    def __init__(self):
        super(StaticUserGraphConstruction, self).__init__()

    @classmethod
    def static_topology(cls, questions: pd.DataFrame, answers: pd.DataFrame, comments: pd.DataFrame) -> GraphData:
        return cls._construct_static_graph(questions, answers, comments)

    @classmethod
    def _construct_static_graph(cls, questions: pd.DataFrame, answers: pd.DataFrame, comments: pd.DataFrame):
        # Work-in-progress scaffolding for the user graph.
        user_graph = GraphData()
        next_node = 0
        color_map = []
        node_features = []
        tag_dict = {}  # tag name -> node id
        module_dict = {}
        edges_src = []
        edges_dest = []

    @classmethod
    def display_graph(cls, g: GraphData, color_map=None) -> None:
        plt.figure(figsize=(40, 40))
        dgl_ug = g.to_dgl()
        nx_ug_graph = dgl_ug.to_networkx()
        pos_ug = nx.spring_layout(nx_ug_graph)  # , k=0.15, iterations=20)
        if color_map is not None:
            nx.draw(nx_ug_graph, pos_ug, with_labels=True, node_color=color_map)
        else:
            nx.draw(nx_ug_graph, pos_ug, with_labels=True, node_color=[[.7, .7, .7]])


if __name__ == '__main__':
    t = DependencyBasedGraphConstruction(None)
    t(None)
    # graph_topology = StaticUserGraphConstruction()
    # graph_topology(GraphData())
%% Cell type:code id: tags:
``` python
import sqlite3
# Create your connection.
db = sqlite3.connect('../stackoverflow.db')
```
%% Cell type:code id: tags:
``` python
QUESTIONS_RETRIEVED = 70000
```
%% Cell type:code id: tags:
``` python
import pandas as pd
valid_questions = pd.read_sql_query(f"""
SELECT * FROM Post Q
INNER JOIN Post A ON Q.PostId = A.ParentId
WHERE (Q.Tags LIKE '%<python>%')
GROUP BY A.ParentId
HAVING SUM(A.Score) > 15
LIMIT {QUESTIONS_RETRIEVED}
""", db)
#valid_questions.set_index('PostId', inplace=True)
valid_questions
```
%% Output
PostId PostTypeId AcceptedAnswerId CreationDate Score \
0 337 1 342.0 2008-08-02T03:35:55.697 82
1 469 1 3040.0 2008-08-02T15:11:16.430 47
2 502 1 7090.0 2008-08-02T17:01:58.500 58
3 535 1 541.0 2008-08-02T18:43:54.787 68
4 594 1 595.0 2008-08-03T01:15:08.507 55
... ... ... ... ... ...
69995 53827601 1 53827671.0 2018-12-18T06:39:38.237 14
69996 53829045 1 53829145.0 2018-12-18T08:32:12.813 14
69997 53829896 1 53829925.0 2018-12-18T09:26:32.470 11
69998 53830081 1 53830333.0 2018-12-18T09:37:15.270 12
69999 53832607 1 NaN 2018-12-18T12:03:57.233 11
ViewCount Body \
0 10098 <p>I am about to build a piece of a project th...
1 4372 <p>I am using the Photoshop's javascript API t...
2 17199 <p>I have a cross-platform (Python) applicatio...
3 9540 <p>I am starting to work on a hobby project wi...
4 57408 <p>There are several ways to iterate over a re...
... ... ...
69995 23334 <p>I am getting below error. Is there any way ...
69996 1351 <pre><code>variable=";CREATEDBY~string~1~~72~0...
69997 63713 <p>I'm trying with the code from link below to...
69998 97349 <p>I am currently trying to compare values fro...
69999 30479 <p>I have a list:</p>\n<pre><code>ueid_list = ...
OwnerUserId LastEditorUserId LastEditorDisplayName ... \
0 111.0 2336654.0 None ...
1 147.0 1997093.0 Ash ...
2 147.0 9780149.0 Adam Mitz ...
3 154.0 7232508.0 Robert Gamble ...
4 116.0 116.0 Mark Harrison ...
... ... ... ... ...
69995 8930395.0 8930395.0 None ...
69996 2996372.0 63550.0 None ...
69997 8410477.0 NaN None ...
69998 10805303.0 9698684.0 None ...
69999 10779037.0 63550.0 None ...
LastEditorDisplayName LastEditDate Title Tags AnswerCount \
0 None None None None None
1 None None None None None
2 Jeff Atwood 2015-08-29T23:20:48.107 None None None
3 user1873471 2013-01-28T03:54:17.217 None None None
4 None 2016-10-15T20:47:11.027 None None None
... ... ... ... ... ...
69995 None None None None None
69996 None 2018-12-18T09:33:34.967 None None None
69997 None 2018-12-18T09:31:48.723 None None None
69998 None 2020-05-20T13:51:35.490 None None None
69999 None 2021-07-20T11:07:51.030 None None None
CommentCount FavoriteCount CommunityOwnedDate ContentLicense ParentId
0 1 None None CC BY-SA 2.5 337
1 0 None None CC BY-SA 2.5 469
2 0 None None CC BY-SA 3.0 502
3 4 None None CC BY-SA 3.0 535
4 2 None None CC BY-SA 3.0 594
... ... ... ... ... ...
69995 1 None None CC BY-SA 4.0 53827601
69996 6 None None CC BY-SA 4.0 53829045
69997 0 None None CC BY-SA 4.0 53829896
69998 0 None None CC BY-SA 4.0 53830081
69999 0 None None CC BY-SA 4.0 53832607
[70000 rows x 38 columns]
%% Cell type:code id: tags:
``` python
```