implementing heterogeneous GNN manually

6b2cb6df · Liam Byrne · 2e26cfaf · 6b2cb6df · 6b2cb6df · 6b2cb6df
Commit 6b2cb6df authored Feb 13, 2023 by Liam Byrne
--- a/embeddings/GAT.py
+++ b/embeddings/GAT.py
@@ -12,7 +12,8 @@ class GAT(torch.nn.Module):
        self.conv1 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
        self.conv2 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
        self.conv3 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
-        self.lin = Linear(hidden_channels, 3)
+        self.lin = Linear(hidden_channels, 2)
+        self.softmax = torch.nn.Softmax(dim=1)
        self.pool = MeanAggregation()
    def forward(self, x, edge_index, batch, post_emb):
@@ -30,7 +31,7 @@ class GAT(torch.nn.Module):
        # 3. Apply a final classifier
        #x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
+        #x = self.softmax(x)
        return x
@@ -38,8 +39,11 @@ def train(model, train_loader):
    model.train()
    for data in train_loader:  # Iterate in batches over the training dataset.
-         out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
+        print(data)
-         loss = criterion(out, data.y)  # Compute the loss.
+        out = model(data.x_dict, data.edge_index_dict, data.batch_dict, torch.concat([data.question_emb, data.answer_emb]))  # Perform a single forward pass.
+        print(out, data.label)
+        loss = criterion(out, data.label)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.
@@ -57,25 +61,25 @@ def test(loader):
 if __name__ == '__main__':
-    dataset = UserGraphDataset(root="data")
+    dataset = UserGraphDataset(root="../data", skip_processing=True)
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [0.6, 0.1, 0.3])
-    train_loader = DataLoader(train_dataset, batch_size=64)
+    print(dataset.num_node_features)
+    train_loader = DataLoader(train_dataset, batch_size=1)
    val_loader = DataLoader(val_dataset, batch_size=64)
    test_loader = DataLoader(test_dataset, batch_size=64)
    model = GAT(hidden_channels=64)
+    sample = train_dataset[0]
-    example_graph = train_loader[0]
+    metadata = (['question', 'answer', 'comment', 'tag', 'module'], [('tag', 'describes', 'question'), ('tag', 'describes', 'answer'), ('tag', 'describes', 'comment'), ('module', 'imported_in', 'question'), ('module', 'imported_in', 'answer'), ('question', 'rev_describes', 'tag'), ('answer', 'rev_describes', 'tag'), ('comment', 'rev_describes', 'tag'), ('question', 'rev_imported_in', 'module'), ('answer', 'rev_imported_in', 'module')])
-    print(example_graph)
+    model = to_hetero(model, metadata, aggr='sum')
-    model = to_hetero(model, example_graph.metadata())
+    #print(model(sample.x_dict, sample.edge_index_dict, sample.batch_dict))
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()
    for epoch in range(1, 10):
        train(model, train_loader)
-        train_acc = test(train_loader)
+        #train_acc = test(train_loader)
-        print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')
+        #print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')
--- a/embeddings/__pycache__/dataset.cpython-38.pyc
+++ b/embeddings/__pycache__/dataset.cpython-38.pyc
--- a/embeddings/__pycache__/post_embedding_builder.cpython-38.pyc
+++ b/embeddings/__pycache__/post_embedding_builder.cpython-38.pyc
--- a/embeddings/__pycache__/static_graph_construction.cpython-38.pyc
+++ b/embeddings/__pycache__/static_graph_construction.cpython-38.pyc
--- a/embeddings/dataset.py
+++ b/embeddings/dataset.py
 import logging
 import os.path
 import pickle
+import re
 import sqlite3
+from typing import List
 import pandas as pd
 import torch
 from bs4 import MarkupResemblesLocatorWarning
-from torch_geometric.data import Dataset, download_url, Data
+from torch_geometric.data import Dataset, download_url, Data, HeteroData
 from tqdm import tqdm
 import warnings
 warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning)
@@ -20,12 +22,14 @@ log = logging.getLogger("dataset")
 class UserGraphDataset(Dataset):
-    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None, db_address:str=None, question_count=70000):
+    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None, db_address:str=None, skip_processing=False):
-        self._question_count = question_count
+        self._skip_processing = skip_processing
        # Connect to database.
        if db_address is not None:
            self._db = sqlite3.connect(db_address)
            self._post_embedding_builder = PostEmbedding()
+        # Call init last, as it may trigger the process function.
        super().__init__(root, transform, pre_transform, pre_filter)
    @property
@@ -34,63 +38,94 @@ class UserGraphDataset(Dataset):
    @property
    def processed_file_names(self):
-        return os.listdir("data/processed")
+        if self._skip_processing:
+            return os.listdir("../data/processed")
+        return []
    def download(self):
        pass
+    def get_unprocessed_ids(self):
+        # Load IDs of questions to use.
+        with open("../data/raw/valid_questions.pkl", 'rb') as f:
+            question_ids = pickle.load(f)
+        processed = []
+        max_idx = -1
+        for f in os.listdir("../data/processed"):
+            question_id_search = re.search(r"id_(\d+)", f)
+            if question_id_search:
+                processed.append(int(question_id_search.group(1)))
+            idx_search = re.search(r"data_(\d+)", f)
+            if idx_search:
+                next_idx = int(idx_search.group(1))
+                if next_idx > max_idx:
+                    max_idx = int(idx_search.group(1))
+        # Fetch question ids that have not been processed yet.
+        unprocessed = [q_id for q_id in question_ids if q_id not in processed]
+        return unprocessed, max_idx+1
    def process(self):
-        idx = 0
+        """
-        valid_questions = self.fetch_valid_questions()
+        """
+        log.info("Processing data...")
+        # Fetch the unprocessed questions and the next index to use.
+        unprocessed, idx = self.get_unprocessed_ids()
+        print(unprocessed, idx)
+        # Fetch questions from database.
+        valid_questions = self.fetch_questions_by_post_ids(unprocessed)
        for row in tqdm(valid_questions.itertuples(), total=len(valid_questions)):
            # Build Question embedding
-            question_emb = self._post_embedding_builder(
+            question_word_emb, question_code_emb, _ = self._post_embedding_builder(
                row.question_body,
                use_bert=True,
                title=row.question_title
            )
+            question_emb = torch.concat((question_word_emb, question_code_emb))
+            # Fetch answers to question
            answers_to_question = self.fetch_answers_for_question(row.post_id)
            # Build Answer embeddings
            for _, answer_body, answer_user_id, score in answers_to_question.itertuples():
-                answer_emb = self._post_embedding_builder(
+                label = torch.tensor([1 if score > 0 else 0], dtype=torch.long)
-                    answer_body,
+                answer_word_emb, answer_code_emb, _ = self._post_embedding_builder(
-                    use_bert=True
+                    answer_body, use_bert=True
                )
+                answer_emb = torch.concat((answer_word_emb, answer_code_emb))
                # Build graph
-                graph = self.construct_graph(answer_user_id)
+                graph: HeteroData = self.construct_graph(answer_user_id)
                # pytorch geometric data object
-                data = Data(
+                graph.__setattr__('question_emb', question_emb)
-                    x=graph.x_dict,
+                graph.__setattr__('answer_emb', answer_emb)
-                    edge_index=graph.edge_index_dict,
+                graph.__setattr__('label', label)
-                    y=torch.LongTensor(1 if score > 0 else 0),
+                torch.save(graph, os.path.join(self.processed_dir, f'data_{idx}_question_id_{row.post_id}'))
-                    question_emb=question_emb,
-                    answer_emb=answer_emb
-                )
-                torch.save(data, os.path.join(self.processed_dir, f'data_{idx}.pt'))
                idx += 1
    def len(self):
        return len(self.processed_file_names)
    def get(self, idx):
-        data = torch.load(os.path.join(self.processed_dir, f'data_{idx}.pt'))
+        file_name = [filename for filename in os.listdir('../data/processed/') if filename.startswith(f"data_{idx}")]
+        if len(file_name):
+            data = torch.load(os.path.join(self.processed_dir, file_name[0]))
            return data
+        else:
+            raise Exception(f"Data with index {idx} not found.")
    '''
    Database functions
    '''
-    def fetch_valid_questions(self):
+    def fetch_questions_by_post_ids(self, post_ids: List[int]):
-        valid_questions = pd.read_sql_query(f"""
+        questions_df = pd.read_sql_query(f"""
-                SELECT Q.PostId, Q.Body, Q.Title, Q.OwnerUserId FROM Post Q
+                SELECT PostId, Body, Title, OwnerUserId FROM Post
-                INNER JOIN Post A ON Q.PostId = A.ParentId
+                WHERE PostId IN ({','.join([str(x) for x in post_ids])})
-                WHERE (Q.Tags LIKE '%<python>%')
-                GROUP BY A.ParentId
-                HAVING SUM(A.Score) > 15
-                LIMIT {self._question_count}
        """, self._db)
-        valid_questions.columns = ['post_id', 'question_body', 'question_title', 'question_user_id']
+        questions_df.columns = ['post_id', 'question_body', 'question_title', 'question_user_id']
-        return valid_questions
+        return questions_df
    def fetch_questions_by_user(self, user_id: int):
        questions_df = pd.read_sql_query(f"""
@@ -150,5 +185,18 @@ class UserGraphDataset(Dataset):
 if __name__ == '__main__':
-    ds = UserGraphDataset('../data/', db_address='../stackoverflow.db', question_count=100)
+    '''
-    print(ds.get(0))
+    Build List of question post_ids.
\ No newline at end of file
+    This will be fed into the Dataset class to construct the graphs.
+    This setup allows us to have a fixed set of questions/answers
+    for each dataset (rather than selecting new questions each time).
+    '''
+    ds = UserGraphDataset('../data/', db_address='../stackoverflow.db', skip_processing=False)
+    data = ds.get(1)
+    print("Question ndim:", data.x_dict['question'].dim())
+    print("Answer ndim:", data.x_dict['answer'].dim())
+    print("Comment ndim:", data.x_dict['comment'].dim())
+    print("Tag ndim:", data.x_dict['tag'].dim())
+    print("Module ndim:", data.x_dict['module'].dim())
\ No newline at end of file
--- a/embeddings/post_embedding_builder.py
+++ b/embeddings/post_embedding_builder.py
@@ -113,14 +113,14 @@ class PostEmbedding(nn.Module):
        return torch.sum(word_embeddings, dim=0) / len(tokens)
    def to_bert_embedding(self, text: str) -> torch.tensor:
-        sentences = [i.text for i in self._en(text).sents]
+        # if not len(text):
-        if not len(sentences):
+        #     return torch.zeros(768)
-            return torch.zeros(768)
+        encodings = self._bert_tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=512)
-        encodings = self._bert_tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length=512)
        with torch.no_grad():
-            outputs = self._bert_model(**encodings, output_hidden_states=True)
+            outputs = self._bert_model(**encodings)
-            cls = outputs.hidden_states[-1][0,0,:]
+            last_layer = outputs.last_hidden_state
-        return cls
+            cls = last_layer[:, 0, :]
+            return torch.squeeze(cls) # Converts from dim [1, 768] to [768]
    def to_code_bert_embedding(self, code):
@@ -215,5 +215,5 @@ class PostEmbedding(nn.Module):
 if __name__ == '__main__':
    pe = PostEmbedding()
    #print(pe.to_code_bert_embedding("\n".join(["for i in range(32):\n    #return 6 or something\n"])).shape)
-    print(pe.to_bert_embedding("This is a test sentence."))
+    print(pe.to_bert_embedding("This is a test sentence.").shape)
    #print([x.module for x in pe.get_imports_via_regex(BeautifulSoup("<code>import ast<\code>", 'lxml'))])
--- a/embeddings/pyg_construction_demo.ipynb
+++ b/embeddings/pyg_construction_demo.ipynb
--- a/embeddings/static_graph_construction.py
+++ b/embeddings/static_graph_construction.py
@@ -9,6 +9,7 @@ import torch
 from torch_geometric.data import HeteroData
 from post_embedding_builder import Import, PostEmbedding
+import torch_geometric.transforms as T
 class StaticGraphConstruction:
@@ -75,7 +76,7 @@ class StaticGraphConstruction:
            for module in modules:
                self._module_to_comment_edges.append((self._known_modules[module], i))
-            yield torch.concat((word_embedding, code_embedding))
+            yield word_embedding
    def process_tags(self):
        if not len(self._known_tags):
@@ -121,21 +122,27 @@ class StaticGraphConstruction:
        tag_nodes = list(self.process_tags())
        module_nodes = list(self.process_modules())
-        if len(question_nodes):
+        # Assign node features
-            self._data['question'].x = torch.stack(question_nodes)
+        self._data['question'].x = torch.stack(question_nodes) if len(question_nodes) else torch.empty(0, 1536)
-        if len(answer_nodes):
-            self._data['answer'].x = torch.stack(answer_nodes)
+        self._data['answer'].x = torch.stack(answer_nodes) if len(answer_nodes) else torch.empty(0, 1536)
-        if len(comment_nodes):
-            self._data['comment'].x = torch.stack(comment_nodes)
+        self._data['comment'].x = torch.stack(comment_nodes) if len(comment_nodes) else torch.empty(0, 768)
-        if len(tag_nodes):
-            self._data['tag'].x = torch.stack(tag_nodes)
+        self._data['tag'].x = torch.stack(tag_nodes) if len(tag_nodes) else torch.empty(0, 90)
-        if len(module_nodes):
-            self._data['module'].x = torch.stack(module_nodes)
+        self._data['module'].x = torch.stack(module_nodes) if len(module_nodes) else torch.empty(0, 110)
-        self._data['tag', 'describes', 'question'].edge_index = torch.tensor(self._tag_to_question_edges).t().contiguous()
+        # Assign edge indexes
-        self._data['tag', 'describes', 'answer'].edge_index = torch.tensor(self._tag_to_answer_edges).t().contiguous()
+        self._data['tag', 'describes', 'question'].edge_index = torch.tensor(self._tag_to_question_edges).t().contiguous() if len(self._tag_to_question_edges) else torch.empty(2,0, dtype=torch.long)
-        self._data['tag', 'describes', 'comment'].edge_index = torch.tensor(self._tag_to_comment_edges).t().contiguous()
+        self._data['tag', 'describes', 'answer'].edge_index = torch.tensor(self._tag_to_answer_edges).t().contiguous() if len(self._tag_to_answer_edges) else torch.empty(2,0, dtype=torch.long)
-        self._data['module', 'imported_in', 'question'].edge_index = torch.tensor(self._module_to_question_edges).t().contiguous()
+        self._data['tag', 'describes', 'comment'].edge_index = torch.tensor(self._tag_to_comment_edges).t().contiguous() if len(self._tag_to_comment_edges) else torch.empty(2,0, dtype=torch.long)
-        self._data['module', 'imported_in', 'answer'].edge_index = torch.tensor(self._module_to_answer_edges).t().contiguous()
+        self._data['module', 'imported_in', 'question'].edge_index = torch.tensor(self._module_to_question_edges).t().contiguous() if len(self._module_to_question_edges) else torch.empty(2,0, dtype=torch.long)
+        self._data['module', 'imported_in', 'answer'].edge_index = torch.tensor(self._module_to_answer_edges).t().contiguous() if len(self._module_to_answer_edges) else torch.empty(2,0, dtype=torch.long)
-        return self._data
+        # Remove isolated nodes, and convert to undirected graph
+        graph_out = T.remove_isolated_nodes.RemoveIsolatedNodes()(self._data)
+        graph_out = T.ToUndirected()(graph_out)
+        graph_out.metadata()
+        return graph_out
--- a/embeddings/training_set_builder.ipynb
+++ b/embeddings/training_set_builder.ipynb
@@ -11,6 +11,7 @@
    "import sqlite3\n",
    "import pandas as pd\n",
    "import logging\n",
+    "import pickle\n",
    "logging.basicConfig()\n",
    "logging.getLogger().setLevel(logging.INFO)\n",
    "log = logging.getLogger(\"training-set-builder\")\n",
@@ -60,6 +61,20 @@
    "collapsed": false
   }
  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "outputs": [],
+   "source": [
+    "save = True\n",
+    "if save:\n",
+    "    with open(\"../data/raw/valid_questions.pkl\", \"wb\") as f:\n",
+    "        pickle.dump(valid_questions['post_id'].to_list(), f)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
  {
   "cell_type": "code",
   "execution_count": 4,
@@ -373,6 +388,7 @@
    }
   ],
   "source": [
+    "import torch\n",
    "from datetime import date\n",
    "from post_embedding_builder import PostEmbedding\n",
    "from static_graph_construction import StaticGraphConstruction\n",
@@ -390,16 +406,25 @@
    "for row in valid_questions.itertuples():\n",
    "    log.info(f\"processing question {question_c}\")\n",
    "    # Build Question embedding\n",
-    "    question_emb = post_embedding_builder(row.question_body, use_bert=True, title=row.question_title)\n",
+    "    question_word_emb, question_code_emb, _ = post_embedding_builder(\n",
+    "        row.question_body,\n",
+    "        use_bert=True,\n",
+    "        title=row.question_title\n",
+    "    )\n",
+    "    question_emb = torch.concat((question_word_emb, question_code_emb))\n",
    "    # Build Answer embeddings\n",
    "    for _, answer_body, answer_user_id, score in fetch_answers_for_question(row.post_id, db).itertuples():\n",
    "        log.info(f\"processing answer {answer_c}\")\n",
-    "        ans_emb = post_embedding_builder(answer_body, use_bert=True)\n",
+    "        answer_word_emb, answer_code_emb, _  = post_embedding_builder(\n",
+    "            answer_body, use_bert=True\n",
+    "        )\n",
+    "        answer_emb = torch.concat((answer_word_emb, answer_code_emb))\n",
    "        # Construct User Graph\n",
    "        ug = construct_graph(answer_user_id, db)\n",
+    "\n",
    "        data[\"graphs\"].append(ug)\n",
    "        data[\"questions\"].append(question_emb)\n",
-    "        data[\"answers\"].append(ans_emb)\n",
+    "        data[\"answers\"].append(answer_emb)\n",
    "        data[\"labels\"].append(1 if score > 0 else 0)\n",
    "\n",
    "        # Dataset will grow larger than memory, so batch in pickle files\n",

 %% Cell type:code id: tags:
 ``` python
 import sqlite3
 import pandas as pd
 import logging
+import pickle
 logging.basicConfig()
 logging.getLogger().setLevel(logging.INFO)
 log = logging.getLogger("training-set-builder")
 # Create your connection.
 db = sqlite3.connect('../stackoverflow.db')
 ```
 %% Cell type:code id: tags:
 ``` python
 QUESTIONS_RETREIVED = 12
 ```
 %% Cell type:code id: tags:
 ``` python
 # Select questions whose Tags contain '<python>' and whose child posts (rows
 # joined on ParentId) have scores summing to more than 15, capped at
 # QUESTIONS_RETREIVED rows.
 valid_questions = pd.read_sql_query(f"""
        SELECT Q.PostId, Q.Body, Q.Title, Q.OwnerUserId FROM Post Q
        INNER JOIN Post A ON Q.PostId = A.ParentId
        WHERE (Q.Tags LIKE '%<python>%')
        GROUP BY A.ParentId
        HAVING SUM(A.Score) > 15
        LIMIT {QUESTIONS_RETREIVED}
 """, db)
 # Rename the columns to the snake_case names that the later cells read
 # (row.post_id, row.question_body, row.question_title).
 valid_questions.columns = ['post_id', 'question_body', 'question_title', 'question_user_id']
 # Bare expression: shows the frame as the cell output.
 valid_questions
 ```
 %% Output
    post_id                                      question_body  \
 0       337  <p>I am about to build a piece of a project th...   
 1       469  <p>I am using the Photoshop's javascript API t...   
 2       502  <p>I have a cross-platform (Python) applicatio...   
 3       535  <p>I am starting to work on a hobby project wi...   
 4       594  <p>There are several ways to iterate over a re...   
 5       683  <p>I don't remember whether I was dreaming or ...   
 6       742  <p><a href="http://www.djangoproject.com/" rel...   
 7       766  <p>I can get Python to work with Postgresql bu...   
 8       773  <p>I haven't been able to find an understandab...   
 9       972  <p>I've read that it is possible to add a meth...   
 10     1171  <p>I need to be able to manipulate a large (10...   
 11     1476  <p>How do you express an integer as a binary n...   
                                       question_title  question_user_id  
 0                            XML Processing in Python               111  
 1   How can I find the full path to a font from it...               147  
 2             Get a preview JPEG of a PDF on Windows?               147  
 3   Continuous Integration System for a Python Cod...               154  
 4      cx_Oracle: How do I iterate over a result set?               116  
 5   Using 'in' to match an attribute of Python obj...               199  
 6                               Class views in Django               189  
 7                                    Python and MySQL           1384652  
 8                   How do I use itertools.groupby()?               207  
 9      Adding a method to an existing object instance               145  
 10  What is the most efficient graph data structur...               280  
 11      How do you express binary literals in Python?                92  
 %% Cell type:code id: tags:
 ``` python
+save = True
+if save:
+    with open("../data/raw/valid_questions.pkl", "wb") as f:
+        pickle.dump(valid_questions['post_id'].to_list(), f)
+```
+%% Cell type:code id: tags:
+``` python
 def fetch_questions_by_user(user_id: int, db):
    """Fetch the Python-tagged questions a user owns or last edited.

    Selects every column of ``Post`` rows whose ``Tags`` contain 'python',
    whose ``PostTypeId`` is 1, and whose ``OwnerUserId`` or
    ``LastEditorUserId`` equals ``user_id``.

    :param user_id: id interpolated into the SQL text.
    :param db: connection handed to ``pd.read_sql_query``.
    :return: DataFrame of the matching rows, indexed by ``PostId``.
    """
    query = f"""
            SELECT *
            FROM Post
            WHERE Tags LIKE '%python%' AND (PostTypeId = 1) AND ((LastEditorUserId = {user_id}) OR (OwnerUserId = {user_id}))
    """
    # Non-inplace set_index yields the same frame the in-place call produced.
    return pd.read_sql_query(query, db).set_index('PostId')
 ```
 %% Cell type:code id: tags:
 ``` python
 def fetch_answers_by_user(user_id: int, db):
    answers_df = pd.read_sql_query(f"""
            SELECT A.Tags, B.*
            FROM Post A
                INNER JOIN Post B ON (B.ParentId = A.PostId) AND (B.ParentId IS NOT NULL)
            WHERE A.Tags LIKE '%python%' AND (B.PostTypeId = 2) AND ((B.LastEditorUserId = {user_id}) OR (B.OwnerUserId = {user_id}))
    """, db)
    answers_df = answers_df.loc[:, ~answers_df.columns.duplicated()].copy()
    answers_df.set_index('PostId', inplace=True)
    return answers_df
 ```
 %% Cell type:code id: tags:
 ``` python
 def fetch_answers_for_question(question_post_id: int, db):
    """Fetch body, owner and score of every post whose parent is the question.

    :param question_post_id: ``PostId`` of the question, interpolated into
        the SQL text as the ``ParentId`` filter.
    :param db: connection handed to ``pd.read_sql_query``.
    :return: DataFrame with columns ``Body``, ``OwnerUserId`` and ``Score``;
        rows containing any missing value are dropped.
    """
    query = f"""
            SELECT Body, OwnerUserId, Score
            FROM Post
            WHERE ParentId = {question_post_id}
    """
    return pd.read_sql_query(query, db).dropna()
 ```
 %% Cell type:code id: tags:
 ``` python
 def fetch_comments_by_user(user_id: int, db):
    """Fetch a user's comments on Python-tagged questions and their answers.

    Runs two queries and stacks the results, question comments first:

    * comments on ``Post`` rows with ``PostTypeId = 1`` whose ``Tags``
      contain 'python';
    * comments on ``Post`` rows with ``PostTypeId = 2`` whose parent's
      ``Tags`` contain 'python'.

    :param user_id: id interpolated into both SQL texts (``Comment.UserId``).
    :param db: connection handed to ``pd.read_sql_query``.
    :return: concatenated DataFrame indexed by ``CommentId``, with a ``Tags``
        column taken from the question row.
    """
    on_questions_sql = f"""
            SELECT A.Tags, B.*
            FROM Post A
                INNER JOIN Comment B ON (B.PostId = A.PostId)
            WHERE A.Tags LIKE '%python%' AND (B.UserId = {user_id}) AND (A.PostTypeId = 1)
    """
    on_answers_sql = f"""
        SELECT A.Tags, C.*
        FROM Post A
            INNER JOIN Post B ON (B.ParentId = A.PostId) AND (B.ParentId IS NOT NULL)
            INNER JOIN Comment C ON (B.PostId = C.PostId)
        WHERE A.Tags LIKE '%python%' AND (C.UserId = {user_id}) AND (B.PostTypeId = 2)
    """
    # Queries run in the listed order: question comments, then answer comments.
    frames = [
        pd.read_sql_query(sql, db).set_index('CommentId')
        for sql in (on_questions_sql, on_answers_sql)
    ]
    return pd.concat(frames)
 ```
 %% Cell type:code id: tags:
 ``` python
 def construct_graph(user_id, db):
    """Build a user's graph from their questions, answers and comments.

    Creates a fresh ``StaticGraphConstruction`` and passes it the three
    frames fetched for ``user_id``.

    :param user_id: id forwarded to each ``fetch_*_by_user`` helper.
    :param db: database connection forwarded to the same helpers.
    :return: whatever ``StaticGraphConstruction.construct`` returns.
    """
    builder = StaticGraphConstruction()
    # Keyword arguments are evaluated left to right, so the fetch order
    # (questions, answers, comments) is unchanged.
    return builder.construct(
        questions=fetch_questions_by_user(user_id, db),
        answers=fetch_answers_by_user(user_id, db),
        comments=fetch_comments_by_user(user_id, db),
    )
 ```
 %% Cell type:code id: tags:
 ``` python
+import torch
 from datetime import date
 from post_embedding_builder import PostEmbedding
 from static_graph_construction import StaticGraphConstruction
 import pickle
 post_embedding_builder = PostEmbedding()
 FILE_BATCH_SIZE = 10
 data = {"graphs":[], "questions": [], "answers": [], "labels": []}
 question_c = 1
 answer_c = 1
 for row in valid_questions.itertuples():
    log.info(f"processing question {question_c}")
    # Build Question embedding
-    question_emb = post_embedding_builder(row.question_body, use_bert=True, title=row.question_title)
+    question_word_emb, question_code_emb, _ = post_embedding_builder(
+        row.question_body,
+        use_bert=True,
+        title=row.question_title
+    )
+    question_emb = torch.concat((question_word_emb, question_code_emb))
    # Build Answer embeddings
    for _, answer_body, answer_user_id, score in fetch_answers_for_question(row.post_id, db).itertuples():
        log.info(f"processing answer {answer_c}")
-        ans_emb = post_embedding_builder(answer_body, use_bert=True)
+        answer_word_emb, answer_code_emb, _  = post_embedding_builder(
+            answer_body, use_bert=True
+        )
+        answer_emb = torch.concat((answer_word_emb, answer_code_emb))
        # Construct User Graph
        ug = construct_graph(answer_user_id, db)
        data["graphs"].append(ug)
        data["questions"].append(question_emb)
-        data["answers"].append(ans_emb)
+        data["answers"].append(answer_emb)
        data["labels"].append(1 if score > 0 else 0)
        # Dataset will grow larger than memory, so batch in pickle files
        with open(f"../data/raw/batch{date.today():%m-%d-%Y}.pkl", "wb") as f:
            pickle.dump(data, f)
            data = {"graphs":[], "questions": [], "answers": [], "labels": []}
        answer_c += 1
    question_c += 1
    answer_c = 1
 ```
 %% Output
    INFO:post_embedding_builder:PostEmbedding instantiated!
    INFO:torchtext.vocab.vectors:Loading vectors from .vector_cache\glove.840B.300d.txt.pt
    Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
    - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
    - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
    INFO:post_embedding_builder:PostEmbedding instantiated!
    INFO:torchtext.vocab.vectors:Loading vectors from .vector_cache\glove.840B.300d.txt.pt
    Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
    - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
    - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
    INFO:training-set-builder:processing question 1
    INFO:training-set-builder:processing answer 1
    INFO:training-set-builder:processing answer 2
    INFO:training-set-builder:processing answer 3
    INFO:training-set-builder:processing answer 4
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 5
    INFO:training-set-builder:processing answer 6
    INFO:training-set-builder:processing answer 7
    INFO:training-set-builder:processing answer 8
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 9
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 10
    INFO:training-set-builder:writing batch to file
    INFO:training-set-builder:processing answer 11
    Token indices sequence length is longer than the specified maximum sequence length for this model (1056 > 512). Running this sequence through the model will result in indexing errors
    INFO:training-set-builder:processing question 2
    INFO:training-set-builder:processing answer 1
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 2
    INFO:training-set-builder:processing answer 3
    INFO:training-set-builder:processing answer 4
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing question 3
    INFO:training-set-builder:processing answer 1
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 2
    INFO:training-set-builder:processing answer 3
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing question 4
    INFO:training-set-builder:processing answer 1
    INFO:training-set-builder:processing answer 2
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:404: MarkupResemblesLocatorWarning: The input looks more like a URL than markup. You may want to use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
      warnings.warn(
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:writing batch to file
    INFO:training-set-builder:processing answer 3
    INFO:training-set-builder:processing answer 4
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 5
    INFO:training-set-builder:processing answer 6
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing question 5
    INFO:training-set-builder:processing answer 1
    INFO:training-set-builder:processing answer 2
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 3
    INFO:training-set-builder:processing question 6
    INFO:training-set-builder:processing answer 1
    INFO:training-set-builder:processing answer 2
    INFO:training-set-builder:processing answer 3
    INFO:training-set-builder:writing batch to file
    INFO:training-set-builder:processing answer 4
    INFO:training-set-builder:processing answer 5
    INFO:training-set-builder:processing answer 6
    INFO:training-set-builder:processing answer 7
    INFO:training-set-builder:processing answer 8
    INFO:training-set-builder:processing question 7
    INFO:training-set-builder:processing answer 1
    INFO:training-set-builder:processing answer 2
    INFO:training-set-builder:processing answer 3
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 4
    INFO:training-set-builder:processing answer 5
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:writing batch to file
    INFO:training-set-builder:processing answer 6
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 7
    INFO:training-set-builder:processing answer 8
    INFO:training-set-builder:processing question 8
    INFO:training-set-builder:processing answer 1
    INFO:training-set-builder:processing answer 2
    INFO:training-set-builder:processing answer 3
    INFO:training-set-builder:processing answer 4
    INFO:training-set-builder:processing answer 5
    INFO:training-set-builder:processing answer 6
    INFO:training-set-builder:processing question 9
    INFO:training-set-builder:processing answer 1
    INFO:training-set-builder:writing batch to file
    INFO:training-set-builder:processing answer 2
    INFO:training-set-builder:processing answer 3
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 4
    INFO:training-set-builder:processing answer 5
    INFO:training-set-builder:processing answer 6
    INFO:training-set-builder:processing answer 7
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 8
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 9
    INFO:training-set-builder:processing answer 10
    INFO:training-set-builder:processing answer 11
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:writing batch to file
    INFO:training-set-builder:processing answer 12
    INFO:training-set-builder:processing answer 13
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 14
    INFO:training-set-builder:processing question 10
    INFO:training-set-builder:processing answer 1
    INFO:training-set-builder:processing answer 2
    INFO:training-set-builder:processing answer 3
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 4
    INFO:training-set-builder:processing answer 5
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 6
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 7
    Token indices sequence length is longer than the specified maximum sequence length for this model (879 > 512). Running this sequence through the model will result in indexing errors
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:writing batch to file
    INFO:training-set-builder:processing answer 8
    INFO:training-set-builder:processing answer 9
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:404: MarkupResemblesLocatorWarning: The input looks more like a URL than markup. You may want to use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 10
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 11
    INFO:training-set-builder:processing answer 12
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 13
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 14
    INFO:training-set-builder:processing answer 15
    INFO:training-set-builder:processing answer 16
    INFO:training-set-builder:processing answer 17
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:writing batch to file
    INFO:training-set-builder:processing answer 18
    INFO:training-set-builder:processing answer 19
    INFO:training-set-builder:processing question 11
    INFO:training-set-builder:processing answer 1
    INFO:training-set-builder:processing answer 2
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 3
    INFO:training-set-builder:processing answer 4
    INFO:training-set-builder:processing answer 5
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 6
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 7
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing question 12
    INFO:training-set-builder:processing answer 1
    INFO:training-set-builder:writing batch to file
    INFO:training-set-builder:processing answer 2
    INFO:training-set-builder:processing answer 3
    INFO:training-set-builder:processing answer 4
    INFO:training-set-builder:processing answer 5
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 6
    INFO:training-set-builder:processing answer 7
    C:\Users\liamb\Documents\graph4stackoverflow\venv\lib\site-packages\bs4\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
      warnings.warn(
    INFO:training-set-builder:processing answer 8
 %% Cell type:code id: tags:
 ``` python
 X_raw[0]
 ```
 %% Output
    ---------------------------------------------------------------------------
    NameError                                 Traceback (most recent call last)
 Cell     In [10], line 1
    ----> 1 X_raw[0]
    NameError: name 'X_raw' is not defined
 %% Cell type:code id: tags:
 ``` python
 X_raw[10]
 ```
 %% Cell type:code id: tags:
 ``` python
 ```