diff --git a/.gitignore b/.gitignore index 723ef36f4e4f32c4560383aa5987c575a30c6535..1a192ea1a341a08aae7411cf09df2d2bb0b63dc4 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ -.idea \ No newline at end of file +.idea +data/ +data-collection/out/ \ No newline at end of file diff --git a/data-collection/BadgesHandler.java b/data-collection/BadgesHandler.java new file mode 100644 index 0000000000000000000000000000000000000000..0074ce4010d6c058b2e63e51968bb8ab9fb6e9b4 --- /dev/null +++ b/data-collection/BadgesHandler.java @@ -0,0 +1,101 @@ +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; +import java.io.IOException; + +class BadgeRow { + private String Id; + private String UserId; + private String Name; + private String Date; + private String BadgeClass; + private String TagBased; + + public BadgeRow(String id, String userId, String name, String date, String badgeClass, String tagBased) { + Id = id; + UserId = userId; + Name = name; + Date = date; + BadgeClass = badgeClass; + TagBased = tagBased; + } + + public String getId() { + return Id; + } + + public String getUserId() { + return UserId; + } + + public String getName() { + return Name; + } + + public String getDate() { + return Date; + } + + public String getBadgeClass() { + return BadgeClass; + } + + public String getTagBased() { + return TagBased; + } + + @Override + public String toString() { + return "BadgeRow{" + + "Id='" + Id + '\'' + + ", UserId='" + UserId + '\'' + + ", Name='" + Name + '\'' + + ", Date='" + Date + '\'' + + ", BadgeClass='" + BadgeClass + '\'' + + ", TagBased='" + TagBased + '\'' + + '}'; + } +} + +public class BadgesHandler extends DefaultHandler { + private static final String ROW = "row"; + private static final String BADGES = "badges"; + + @Override + public void startElement(String 
uri, String lName, String qName, Attributes attr) throws SAXException { + if (qName.equals(ROW)) { + BadgeRow badgeRow = new BadgeRow( + attr.getValue("Id"), + attr.getValue("UserId"), + attr.getValue("Name"), + attr.getValue("Date"), + attr.getValue("Class"), + attr.getValue("TagBased") + ); + //System.out.println(badgeRow); + + } else if (!qName.equals(BADGES)) { + throw new SAXException(String.format("Unknown tag %s", qName)); + } + + } + + public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException { + final long start = System.currentTimeMillis(); + System.out.println(start); + SAXParserFactory factory = SAXParserFactory.newInstance(); + SAXParser saxParser = factory.newSAXParser(); + BadgesHandler handler = new BadgesHandler(); + saxParser.parse( + "..\\data\\raw\\stackoverflow.com-Badges\\badges.xml", handler); + + + // time end + final long durationInMilliseconds = System.currentTimeMillis()-start; + System.out.println("SAX parse took " + durationInMilliseconds + "ms."); + } +} diff --git a/data-collection/CommentsHandler.java b/data-collection/CommentsHandler.java new file mode 100644 index 0000000000000000000000000000000000000000..6c038ee4cd6f2ce9f6526e6a452d4a7f6e860782 --- /dev/null +++ b/data-collection/CommentsHandler.java @@ -0,0 +1,112 @@ +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +class CommentRow { + private String id; + private String postId; + private String score; + private String text; + private String creationDate; + private String userId; + private String contentLicense; + + public CommentRow(String id, String postId, String score, String text, + String creationDate, String userId, String contentLicense) { + this.id 
= id; + this.postId = postId; + this.score = score; + this.text = text; + this.creationDate = creationDate; + this.userId = userId; + this.contentLicense = contentLicense; + } + + public String getId() { + return id; + } + + public String getPostId() { + return postId; + } + + public String getScore() { + return score; + } + + public String getText() { + return text; + } + + public String getCreationDate() { + return creationDate; + } + + public String getUserId() { + return userId; + } + + public String getContentLicense() { + return contentLicense; + } + + @Override + public String toString() { + return "CommentRow{" + + "id='" + id + '\'' + + ", postId='" + postId + '\'' + + ", score='" + score + '\'' + + ", text='" + text + '\'' + + ", creationDate='" + creationDate + '\'' + + ", userId='" + userId + '\'' + + ", contentLicense='" + contentLicense + '\'' + + '}'; + } +} +public class CommentsHandler extends DefaultHandler { + private static final String ROW = "row"; + private static final String COMMENTS = "comments"; + private StringBuilder elementValue; + + @Override + public void startElement(String uri, String lName, String qName, Attributes attr) throws SAXException { + if (qName.equals(ROW)) { + CommentRow commentRow = new CommentRow( + attr.getValue("Id"), + attr.getValue("PostId"), + attr.getValue("Score"), + attr.getValue("Text"), + attr.getValue("CreationDate"), + attr.getValue("UserId"), + attr.getValue("ContentLicense") + ); + System.out.println(commentRow); + + } else if (!qName.equals(COMMENTS)) { + throw new SAXException(String.format("Unknown tag %s", qName)); + } + + } + + public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException { + final long start = System.currentTimeMillis(); + + SAXParserFactory factory = SAXParserFactory.newInstance(); + SAXParser saxParser = factory.newSAXParser(); + CommentsHandler handler = new CommentsHandler(); + saxParser.parse( + 
"..\\data\\raw\\stackoverflow.com-Comments\\comments.xml", handler); + + + // time end + final long durationInMilliseconds = System.currentTimeMillis()-start; + System.out.println("SAX parse took " + durationInMilliseconds + "ms."); + } +} diff --git a/data-collection/PostsHandler.java b/data-collection/PostsHandler.java new file mode 100644 index 0000000000000000000000000000000000000000..47b7063534e7f845771c02577b0030d58d03007f --- /dev/null +++ b/data-collection/PostsHandler.java @@ -0,0 +1,206 @@ +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import javax.xml.XMLConstants; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; +import java.io.IOException; + +class PostRow { + private String Id; + private String PostTypeId; + private String AcceptedAnswerId; + private String CreationDate; + private String Score; + private String ViewCount; + private String Body; + private String OwnerUserId; + private String LastEditorUserId; + private String LastEditorDisplayName; + private String LastEditDate; + private String Title; + private String Tags; + private String AnswerCount; + private String CommentCount; + private String FavoriteCount; + private String CommunityOwnedDate; + private String ContentLicense; + + public PostRow(String id, String postTypeId, String acceptedAnswerId, + String creationDate, String score, String viewCount, + String body, String ownerUserId, String lastEditorUserId, + String lastEditorDisplayName, String lastEditDate, + String title, String tags, String answerCount, + String commentCount, String favoriteCount, + String communityOwnedDate, String contentLicense) { + Id = id; + PostTypeId = postTypeId; + AcceptedAnswerId = acceptedAnswerId; + CreationDate = creationDate; + Score = score; + ViewCount = viewCount; + Body = body; + OwnerUserId = ownerUserId; + LastEditorUserId = lastEditorUserId; + 
LastEditorDisplayName = lastEditorDisplayName; + LastEditDate = lastEditDate; + Title = title; + Tags = tags; + AnswerCount = answerCount; + CommentCount = commentCount; + FavoriteCount = favoriteCount; + CommunityOwnedDate = communityOwnedDate; + ContentLicense = contentLicense; + } + + @Override + public String toString() { + return "PostRow{" + + "Id='" + Id + '\'' + + ", PostTypeId='" + PostTypeId + '\'' + + ", AcceptedAnswerId='" + AcceptedAnswerId + '\'' + + ", CreationDate='" + CreationDate + '\'' + + ", Score='" + Score + '\'' + + ", ViewCount='" + ViewCount + '\'' + + ", Body='" + Body + '\'' + + ", OwnerUserId='" + OwnerUserId + '\'' + + ", LastEditorUserId='" + LastEditorUserId + '\'' + + ", LastEditorDisplayName='" + LastEditorDisplayName + '\'' + + ", LastEditDate='" + LastEditDate + '\'' + + ", Title='" + Title + '\'' + + ", Tags='" + Tags + '\'' + + ", AnswerCount='" + AnswerCount + '\'' + + ", CommentCount='" + CommentCount + '\'' + + ", FavoriteCount='" + FavoriteCount + '\'' + + ", CommunityOwnedDate='" + CommunityOwnedDate + '\'' + + ", ContentLicense='" + ContentLicense + '\'' + + '}'; + } + + public String getId() { + return Id; + } + + public String getPostTypeId() { + return PostTypeId; + } + + public String getAcceptedAnswerId() { + return AcceptedAnswerId; + } + + public String getCreationDate() { + return CreationDate; + } + + public String getScore() { + return Score; + } + + public String getViewCount() { + return ViewCount; + } + + public String getBody() { + return Body; + } + + public String getOwnerUserId() { + return OwnerUserId; + } + + public String getLastEditorUserId() { + return LastEditorUserId; + } + + public String getLastEditorDisplayName() { + return LastEditorDisplayName; + } + + public String getLastEditDate() { + return LastEditDate; + } + + public String getTitle() { + return Title; + } + + public String getTags() { + return Tags; + } + + public String getAnswerCount() { + return AnswerCount; + } + + public String 
getCommentCount() { + return CommentCount; + } + + public String getFavoriteCount() { + return FavoriteCount; + } + + public String getCommunityOwnedDate() { + return CommunityOwnedDate; + } + + public String getContentLicense() { + return ContentLicense; + } +} + +public class PostsHandler extends DefaultHandler { + private static final String ROW = "row"; + private static final String POSTS = "posts"; + private StringBuilder elementValue; + + @Override + public void startElement(String uri, String lName, String qName, Attributes attr) throws SAXException { + if (qName.equals(ROW)) { + PostRow postRow = new PostRow( + attr.getValue("Id"), + attr.getValue("PostTypeId"), + attr.getValue("AcceptedAnswerId"), + attr.getValue("CreationDate"), + attr.getValue("Score"), + attr.getValue("ViewCount"), + attr.getValue("Body"), + attr.getValue("OwnerUserId"), + attr.getValue("LastEditorUserId"), + attr.getValue("LastEditorDisplayName"), + attr.getValue("LastEditDate"), + attr.getValue("Title"), + attr.getValue("Tags"), + attr.getValue("AnswerCount"), + attr.getValue("CommentCount"), + attr.getValue("FavoriteCount"), + attr.getValue("CommunityOwnedDate"), + attr.getValue("ContentLicense") + ); + System.out.println(postRow); + + } else if (!qName.equals(POSTS)) { + throw new SAXException(String.format("Unknown tag %s", qName)); + } + + } + + public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException { + final long start = System.currentTimeMillis(); + + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING,false); + SAXParser saxParser = factory.newSAXParser(); + PostsHandler handler = new PostsHandler(); + saxParser.parse( + "..\\data\\raw\\stackoverflow.com-Posts\\posts.xml", handler); + + + // time end + final long durationInMilliseconds = System.currentTimeMillis()-start; + System.out.println("SAX parse took " + durationInMilliseconds + "ms."); + } +} diff --git 
a/data-collection/SQLiteSession.java b/data-collection/SQLiteSession.java new file mode 100644 index 0000000000000000000000000000000000000000..313845e45c197ce0b8e2f471fa17e9628adeb6f6 --- /dev/null +++ b/data-collection/SQLiteSession.java @@ -0,0 +1,40 @@ +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.core.config.Configurator; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; + +public class SQLiteSession { + private final Logger logger = LogManager.getLogger(SQLiteSession.class); + private Connection conn; + public SQLiteSession(String url) { + Configurator.setLevel(logger.getName(), Level.INFO); + try { + // create a connection to the database + conn = DriverManager.getConnection(url); + logger.info("Connection to SQLite has been established."); + } catch (SQLException e) { + logger.error(e.getMessage()); + } + } + + public void endSession() { + try { + if (conn != null) { + conn.close(); + } + } catch (SQLException ex) { + logger.error(ex.getMessage()); + } + } + + public static void main(String[] args) { + SQLiteSession s = new SQLiteSession( + "jdbc:sqlite:C:\\Users\\lhb1g20\\OneDrive - University of Southampton\\graph4stackoverflow\\data-collection\\stackoverflow.db" + ); + s.endSession(); + } +} \ No newline at end of file diff --git a/data-collection/TagsHandler.java b/data-collection/TagsHandler.java new file mode 100644 index 0000000000000000000000000000000000000000..02188aee3c67ec56ecaecb1bd073d6e41ad2ba9b --- /dev/null +++ b/data-collection/TagsHandler.java @@ -0,0 +1,2 @@ +public class TagsHandler { +} diff --git a/data-collection/UsersHandler.java b/data-collection/UsersHandler.java new file mode 100644 index 0000000000000000000000000000000000000000..2714e4b21bc388806be3b1104d12af14becd56f7 --- /dev/null +++ b/data-collection/UsersHandler.java @@ -0,0 +1,2 @@ +public class UsersHandler { +} diff 
// ---- data-collection/VotesHandler.java ----

/** Placeholder: declares no members yet. */
public class VotesHandler {
}

// The same change set also adds data-collection/sqlite-jdbc-3.39.3.0.jar (binary)
// and data-collection/stackoverflow.db.