diff --git a/comp3217_lab2.ipynb b/comp3217_lab2.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..0895a7e92be4931cf4216a3c6b8c2c8ba32cef41 --- /dev/null +++ b/comp3217_lab2.ipynb @@ -0,0 +1,818 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "collapsed_sections": [ + "1qL5ZRt8-Pp0", + "7skwZBwG_t3j", + "_0FW4qf8JbE1", + "ktrKGm8wKcmp" + ], + "private_outputs": true, + "cell_execution_strategy": "setup" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# COMP3217 - Lab 2 - Machine Learning\n", + "\n", + "Welcome to this lab.\n", + "\n", + "In this lab you will learn and practise machine learning concepts. The lab consists of a few tasks, designed to be carried out as exercises. By completing these tasks you will learn how to use machine learning methods to identify security issues in cyber-physical systems, and how to apply them in your coursework. We will try several machine learning methods and evaluate them on simple datasets.\n", + "\n", + "\n", + "### Submission Instructions\n", + "**There is no assessment for the lab and no marks**" + ], + "metadata": { + "id": "WdiAc8Re4nZf" + } + }, + { + "cell_type": "code", + "source": [ + "# Import the required libraries\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import sklearn.datasets\n", + "from sklearn import neighbors, svm\n", + "from sklearn.linear_model import LogisticRegression, LinearRegression\n", + "from sklearn.inspection import DecisionBoundaryDisplay\n", + "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, mean_squared_error, r2_score, accuracy_score\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.model_selection import GridSearchCV, train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "\n", + "import numpy as np\n", + "import pandas as pd" + ], + "metadata": { + "id": "OT66i9GY4izV" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Exercise 1: Working with the Logistic Regression Classifier\n", + "**Learning Outcome: How to load a dataset, use a classifier in scikit-learn, plot decision boundaries, and change hyperparameters.**\n", + "\n" + ], + "metadata": { + "id": "6F4Lvco_4AfP" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "\n", + "\n", + "### Introduction\n", + "Import the Iris Dataset from sklearn.datasets.\n", + "\n", + "This dataset consists of 3 different types of iris (Setosa, Versicolour, and Virginica). It is stored in an array of shape (150 x 4): 150 samples (rows) and four columns representing the 4 features: Sepal Length, Sepal Width, Petal Length and Petal Width.\n", + "\n", + "The code below uses the first two features (Sepal Length and Sepal Width). See [here](https://en.wikipedia.org/wiki/Iris_flower_data_set) for more information on this dataset."
+ ], + "metadata": { + "id": "yqeSFdQU55JS" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kVpHqFog2m3y" + }, + "outputs": [], + "source": [ + "#Taken from Scikit\n", + "\n", + "# import some data from a predefined dataset\n", + "iris = sklearn.datasets.load_iris()\n", + "\n", + "X = iris.data[:, :2] # we only take the first two features, Sepal Length and Sepal Width\n", + "Y = iris.target # we also need the classes that each sample should belong to.\n", + "\n", + "# print the shapes of the arrays X and Y, and the target values\n", + "print(\"Dataset Shape: \", X.shape)\n", + "print(\"Target Shape: \", Y.shape)\n", + "print(\"Targets: \", list(Y))" + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "\n", + "\n", + "### Exercise 1a\n", + "This next part uses Logistic Regression to classify each sample into one of the three classes in the Iris Dataset (Setosa, Versicolour, and Virginica) based on their Sepal Length and Width." + ], + "metadata": { + "id": "JnDuuZAf86Ya" + } + }, + { + "cell_type": "code", + "source": [ + "# Create an instance of Logistic Regression Classifier and fit the data.\n", + "logreg = LogisticRegression(C=1)\n", + "logreg.fit(X, Y)\n", + "\n", + "# Visualise and show the decision boundaries\n", + "_, ax = plt.subplots(figsize=(7, 6))\n", + "DecisionBoundaryDisplay.from_estimator(\n", + " logreg,\n", + " X,\n", + " cmap=plt.cm.Paired,\n", + " ax=ax,\n", + " response_method=\"auto\",\n", + " plot_method=\"pcolormesh\",\n", + " shading=\"auto\",\n", + " xlabel=\"Sepal length\",\n", + " ylabel=\"Sepal width\",\n", + " eps=0.5,\n", + ")\n", + "# Plot the training points\n", + "plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors=\"k\", cmap=plt.cm.Paired)\n", + "plt.xticks(())\n", + "plt.yticks(())\n", + "plt.title(\"Exercise 1a: C = 1\")\n", + "plt.show()" + ], + "metadata": { + "id": "ppqR8_6C8MKi" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "\n", + "\n", + "### Exercise 1b\n", + "Observe the misclassifications of the data in the plot above.\n", + "\n", + "In this next cell, try changing the LogisticRegression hyperparameter `C` to `0.001` and run the cell. Do you see any change?" + ], + "metadata": { + "id": "1qL5ZRt8-Pp0" + } + }, + { + "cell_type": "code", + "source": [ + "# Create an instance of Logistic Regression Classifier and fit the data.\n", + "logreg = LogisticRegression(C=0.001)\n", + "logreg.fit(X, Y)\n", + "\n", + "# Visualise and show the decision boundaries\n", + "_, ax = plt.subplots(figsize=(7, 6))\n", + "DecisionBoundaryDisplay.from_estimator(\n", + " logreg,\n", + " X,\n", + " cmap=plt.cm.Paired,\n", + " ax=ax,\n", + " response_method=\"auto\",\n", + " plot_method=\"pcolormesh\",\n", + " shading=\"auto\",\n", + " xlabel=\"Sepal length\",\n", + " ylabel=\"Sepal width\",\n", + " eps=0.5,\n", + ")\n", + "# Plot the training points\n", + "plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors=\"k\", cmap=plt.cm.Paired)\n", + "plt.xticks(())\n", + "plt.yticks(())\n", + "plt.title(\"Exercise 1b: C = 0.001\")\n", + "plt.show()" + ], + "metadata": { + "id": "ucbCCJmp-dDq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "\n", + "\n", + "### Exercise 1c\n", + "In this next cell, try changing the hyperparameter `C` to `1e5` and run the cell.\n", + "\n", + "Compare the plot to those from Exercises 1a and 1b. Observe the misclassifications. Do you see any change?" + ], + "metadata": { + "id": "7skwZBwG_t3j" + } + }, + { + "cell_type": "code", + "source": [ + "# Create an instance of Logistic Regression Classifier and fit the data.\n", + "logreg = LogisticRegression(C=1e5)\n", + "logreg.fit(X, Y)\n", + "\n", + "# Visualise and show the decision boundaries\n", + "_, ax = plt.subplots(figsize=(7, 6))\n", + "DecisionBoundaryDisplay.from_estimator(\n", + " logreg,\n", + " X,\n", + " cmap=plt.cm.Paired,\n", + " ax=ax,\n", + " response_method=\"auto\",\n", + " plot_method=\"pcolormesh\",\n", + " shading=\"auto\",\n", + " xlabel=\"Sepal length\",\n", + " ylabel=\"Sepal width\",\n", + " eps=0.5,\n", + ")\n", + "# Plot the training points\n", + "plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors=\"k\", cmap=plt.cm.Paired)\n", + "plt.xticks(())\n", + "plt.yticks(())\n", + "plt.title(\"Exercise 1c: C = 1e5\")\n", + "plt.show()" + ], + "metadata": { + "id": "5sU3NIJi_rSq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "\n", + "\n", + "A classifier's hyperparameters play an important role in defining its decision boundaries and hence its results.\n", + "\n", + "**Choose these hyperparameters carefully.**\n", + "\n", + "---\n", + "\n" + ], + "metadata": { + "id": "YO07PevnBH77" + } + },
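+ { + "cell_type": "markdown", + "source": [ + "To make the effect of `C` easier to see in one place, the optional cell below refits the classifier for the three values of `C` used above and prints the training accuracy of each fit. This cell is an illustrative addition to the lab, not part of the original exercises; it reuses the `X` and `Y` arrays loaded earlier." + ], + "metadata": { + "id": "addQgC1SweepMd" + } + }, + { + "cell_type": "code", + "source": [ + "# Optional sketch: sweep the regularisation hyperparameter C and compare training accuracy.\n", + "# Smaller C means stronger regularisation (simpler decision boundaries).\n", + "for c in [0.001, 1, 1e5]:\n", + " clf = LogisticRegression(C=c, max_iter=1000)\n", + " clf.fit(X, Y)\n", + " print(f\"C = {c}: training accuracy = {clf.score(X, Y):.3f}\")" + ], + "metadata": { + "id": "addQgC1Sweep" + }, + "execution_count": null, + "outputs": [] + },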
+ { + "cell_type": "markdown", + "source": [ + "## Exercise 2: Evaluating the Logistic Regression Classifier\n", + "**Learning Outcome: How to evaluate a Logistic Regression classifier, change the training/testing split, display a confusion matrix, and obtain the F1 score and accuracy of the classifier**" + ], + "metadata": { + "id": "GgXXv02kBm_n" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "\n", + "\n", + "### Introduction\n", + "\n", + "For this Exercise we will be using sklearn's Digits dataset, a copy of the test set of the UCI ML hand-written digits datasets: https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n", + "\n", + "The dataset contains 1797 samples of handwritten digits. There are 10 classes, where each class refers to a digit." + ], + "metadata": { + "id": "83t48kTLCaxS" + } + }, + { + "cell_type": "code", + "source": [ + "# load and normalise the digits dataset\n", + "X_digits, y_digits = sklearn.datasets.load_digits(return_X_y=True) # load the digits dataset, retrieving the data (X_digits) and their labels (y_digits)\n", + "X_digits = X_digits / X_digits.max() # Normalise the dataset\n", + "\n", + "n_samples = len(X_digits) # save the number of samples in the dataset\n", + "\n", + "print(\"Dataset Shape: \", X_digits.shape)\n", + "print(\"Target Shape: \", y_digits.shape)\n", + "print(\"Targets: \", y_digits)\n", + "print(f\"There are {n_samples} samples in the dataset\")" + ], + "metadata": { + "id": "XsD0NzaYCoKA" + }, + "execution_count": null, + "outputs": [] + },
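+ { + "cell_type": "markdown", + "source": [ + "Each sample is an 8x8 greyscale image flattened into 64 features. The optional cell below (an illustrative addition, not part of the original lab) reshapes one sample back to 8x8 and displays it, which can help when interpreting the confusion matrices later." + ], + "metadata": { + "id": "addDigitViewMd" + } + }, + { + "cell_type": "code", + "source": [ + "# Optional sketch: view one digit sample as an 8x8 image.\n", + "# Each row of X_digits is a flattened 8x8 image, so reshape it to display.\n", + "plt.imshow(X_digits[0].reshape(8, 8), cmap=\"gray_r\")\n", + "plt.title(f\"Label: {y_digits[0]}\")\n", + "plt.axis(\"off\")\n", + "plt.show()" + ], + "metadata": { + "id": "addDigitView" + }, + "execution_count": null, + "outputs": [] + },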
+ { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "\n", + "\n", + "### Exercise 2a\n", + "\n", + "This cell evaluates the accuracy of the Logistic Regression classifier on the Digits dataset. The split ratio is `0.9`, meaning 90% of the data is used for training and 10% is used for testing.\n", + "\n", + "Observe the **accuracy** and **F1 score** of the classifier. Check the confusion matrix. Observe the misclassifications of the test data." + ], + "metadata": { + "id": "4OyKoer7EdUI" + } + }, + { + "cell_type": "code", + "source": [ + "RATIO = 0.9\n", + "\n", + "# copy 90% of the data from the dataset to the training dataset\n", + "X_train = X_digits[: int(RATIO * n_samples)]\n", + "y_train = y_digits[: int(RATIO * n_samples)]\n", + "print(f\"There are {len(X_train)} samples in the training set\")\n", + "\n", + "# copy 10% of the data from the dataset to the testing dataset\n", + "X_test = X_digits[int(RATIO * n_samples) :]\n", + "y_test = y_digits[int(RATIO * n_samples) :]\n", + "print(f\"There are {len(X_test)} samples in the testing set\")\n", + "\n", + "# train the logistic regression model using our training set\n", + "logreg = LogisticRegression(max_iter=1000)\n", + "trained = logreg.fit(X_train, y_train)\n", + "\n", + "# test the logistic regression model using our testing set\n", + "score = trained.score(X_test, y_test)\n", + "print(f\"Accuracy: {score} ({score*100:.2f}%)\")\n", + "\n", + "# Get predicted labels in our testing set and get the F1 score\n", + "predictions = logreg.predict(X_test)\n", + "f1 = f1_score(y_test, predictions, average='macro')\n", + "print(f\"F1 score: {f1} ({f1*100:.2f}%)\")\n", + "\n", + "# use predicted labels to visualise confusion matrix\n", + "cm = confusion_matrix(y_test, predictions, labels=logreg.classes_)\n", + "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=logreg.classes_)\n", + "disp.plot()\n", + "plt.title(f\"Exercise 2a: Ratio {RATIO}\")\n", + "plt.show()" + ], + "metadata": { + "id": "FMT7xG2xFL60" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "\n", + "\n", + "### Exercise 2b\n", + "\n", + "In this next cell, change the dataset split ratio to 0.8.\n", + "\n", + "Observe the **accuracy** and **F1 score** again. Do they go up or down? Why? Compare this confusion matrix to that of Exercise 2a. Observe the misclassifications. Do you see any change?" + ], + "metadata": { + "id": "_0FW4qf8JbE1" + } + }, + { + "cell_type": "code", + "source": [ + "RATIO = 0.9 # change me!\n", + "\n", + "# copy RATIO of the data from the dataset to the training dataset\n", + "X_train = X_digits[: int(RATIO * n_samples)]\n", + "y_train = y_digits[: int(RATIO * n_samples)]\n", + "print(f\"There are {len(X_train)} samples in the training set\")\n", + "\n", + "# copy the remaining data from the dataset to the testing dataset\n", + "X_test = X_digits[int(RATIO * n_samples) :]\n", + "y_test = y_digits[int(RATIO * n_samples) :]\n", + "print(f\"There are {len(X_test)} samples in the testing set\")\n", + "\n", + "# train the logistic regression model using our training set\n", + "logreg = LogisticRegression(max_iter=1000)\n", + "trained = logreg.fit(X_train, y_train)\n", + "\n", + "# test the logistic regression model using our testing set\n", + "score = trained.score(X_test, y_test)\n", + "print(f\"Accuracy: {score} ({score*100:.2f}%)\")\n", + "\n", + "# Get predicted labels in our testing set and get the F1 score\n", + "predictions = logreg.predict(X_test)\n", + "f1 = f1_score(y_test, predictions, average='macro')\n", + "print(f\"F1 score: {f1} ({f1*100:.2f}%)\")\n", + "\n", + "# use predicted labels to visualise confusion matrix\n", + "cm = confusion_matrix(y_test, predictions, labels=logreg.classes_)\n", + "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=logreg.classes_)\n", + "disp.plot()\n", + "plt.title(f\"Exercise 2b: Ratio {RATIO}\")\n", + "plt.show()" + ], + "metadata": { + "id": "9WGbRRniJ8P4" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "\n", + "The proportion of data used for training and testing has an impact on the performance of the classifier.\n", + "\n", + "The confusion matrix can reveal more insight into performance than accuracy alone, and the F1 score is particularly helpful when the classes are imbalanced.\n", + "\n", + "\n", + "---\n", + "\n" + ], + "metadata": { + "id": "5kxcfTWbKWzt" + } + },
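+ { + "cell_type": "markdown", + "source": [ + "The splits above simply slice the dataset in order, so the test set is always the last block of samples. A common alternative is sklearn's `train_test_split`, which shuffles the data before splitting. The optional cell below is an illustrative sketch of this, not part of the original exercises; `random_state` is fixed only to make reruns reproducible." + ], + "metadata": { + "id": "addShuffleSplitMd" + } + }, + { + "cell_type": "code", + "source": [ + "# Optional sketch: a shuffled 90/10 train/test split, instead of slicing in order.\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X_digits, y_digits, test_size=0.1, random_state=0)\n", + "\n", + "logreg = LogisticRegression(max_iter=1000)\n", + "logreg.fit(X_train, y_train)\n", + "print(f\"Accuracy with a shuffled split: {logreg.score(X_test, y_test):.3f}\")" + ], + "metadata": { + "id": "addShuffleSplit" + }, + "execution_count": null, + "outputs": [] + },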
+ { + "cell_type": "markdown", + "source": [ + "## Exercise 3: Working with Dimensionality Reduction and the Logistic Regression Classifier\n", + "\n", + "**Learning Outcome: How to normalise the data, use PCA, find the best hyperparameters, and evaluate a classifier using PCA components**\n" + ], + "metadata": { + "id": "ktrKGm8wKcmp" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "\n", + "### Exercise 3a\n", + "In this exercise we will be using the Logistic Regression Classifier, Principal Component Analysis (PCA) and a Grid Search to find the optimal hyperparameter `C` for Logistic Regression, along with the optimal number of components for PCA." + ], + "metadata": { + "id": "w8e93G8nMFRa" + } + }, + { + "cell_type": "markdown", + "source": [ + "Firstly, we define and initialise the objects needed for the task: StandardScaler (to normalise the input data), PCA and LogisticRegression.\n", + "\n", + "Then we define our hyperparameter grid. Here we use a list of n_components values for the PCA: 5, 15, 30, 45 and 60 components. We also create a numpy.logspace, a list of values evenly spaced on a logarithmic scale, to use as candidate values for the C hyperparameter.\n", + "\n", + "Finally, we combine our three steps (normalisation, PCA and regression) into a Pipeline from sklearn.pipeline, and pass the pipeline and the hyperparameter grid to GridSearchCV (from sklearn.model_selection) to automate a grid search that finds the best hyperparameters for our model."
+ ], + "metadata": { + "id": "2JVBXRB6OFCU" + } + }, + { + "cell_type": "code", + "source": [ + "# load the data we are going to use\n", + "X_digits, y_digits = sklearn.datasets.load_digits(return_X_y=True)\n", + "\n", + "# define and initialise the objects needed for the exercise\n", + "pca = PCA()\n", + "scaler = StandardScaler()\n", + "logreg = LogisticRegression(max_iter=10000, tol=0.1)\n", + "\n", + "# create a pipeline with three sequential steps: normalisation, PCA and logistic regression\n", + "pipeline = Pipeline(steps=[\n", + " (\"scaler\", scaler),\n", + " (\"pca\", pca),\n", + " (\"logreg\", logreg)\n", + " ])\n", + "\n", + "# create a search grid\n", + "max_C = 10\n", + "min_C = 0.0001\n", + "num_Cs = 10\n", + "\n", + "hyperparameter_grid = {\n", + " \"pca__n_components\": [5, 15, 30, 45, 60],\n", + " \"logreg__C\" : np.logspace(np.log10(min_C), np.log10(max_C), num=num_Cs),\n", + "}\n", + "\n", + "# create and run a grid search\n", + "search = GridSearchCV(pipeline, hyperparameter_grid, n_jobs=2, cv=5)\n", + "search.fit(X_digits, y_digits) # run the search, may take a while\n", + "\n", + "# print the best hyperparameters found by the search\n", + "print(\"Best C:\", search.best_params_[\"logreg__C\"])\n", + "print(\"Best n_components:\", search.best_params_[\"pca__n_components\"])" + ], + "metadata": { + "id": "ZA27rI3zNXip" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "This next code block plots the PCA explained variance ratio of each component, together with the cross-validated classification accuracy for each n_components setting. The goal is to maximise classification accuracy while keeping the number of components small." + ], + "metadata": { + "id": "_oYmOh1WWw7K" + } + }, + { + "cell_type": "code", + "source": [ + "\n", + "pca.fit(X_digits)\n", + "fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(8,8))\n", + "ax0.plot(\n", + " np.arange(1, pca.n_components_ + 1), pca.explained_variance_ratio_, \"+\", linewidth=2\n", + ")\n", + "\n", + "ax0.set_ylabel(\"PCA explained variance ratio\")\n", + "ax0.axvline(\n", + " search.best_estimator_.named_steps[\"pca\"].n_components,\n", + " linestyle=\":\",\n", + " label=\"n_components chosen\",\n", + ")\n", + "ax0.legend(prop=dict(size=12))\n", + "\n", + "# For each number of components, find the best classifier results\n", + "results = pd.DataFrame(search.cv_results_)\n", + "components_col = \"param_pca__n_components\"\n", + "best_clfs = results.groupby(components_col).apply(\n", + " lambda g: g.nlargest(1, \"mean_test_score\")\n", + ")\n", + "\n", + "best_clfs.plot(\n", + " x=components_col, y=\"mean_test_score\", yerr=\"std_test_score\", legend=False, ax=ax1\n", + ")\n", + "ax1.set_ylabel(\"Classification accuracy (val)\")\n", + "ax1.set_xlabel(\"n_components\")\n", + "plt.tight_layout()\n", + "plt.show()\n" + ], + "metadata": { + "id": "dNKk9fLFUR9I" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "The number of PCA components has an impact on the performance of the classifier.\n", + "\n", + "Grid search with cross-validation can be used to find the best hyperparameters for a classifier.\n", + "\n", + "\n", + "---\n", + "\n", + "\n" + ], + "metadata": { + "id": "ZiL_VzgSXzyW" + } + },
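+ { + "cell_type": "markdown", + "source": [ + "A common rule of thumb for choosing n_components is to keep enough components to explain a chosen fraction (for example 95%) of the variance. The optional cell below is an illustrative sketch of this, reusing the `pca` object fitted in the plotting cell above; the 95% threshold is an assumption, not a value prescribed by the lab." + ], + "metadata": { + "id": "addCumVarMd" + } + }, + { + "cell_type": "code", + "source": [ + "# Optional sketch: choose n_components by cumulative explained variance.\n", + "# pca was fitted on X_digits in the plotting cell above.\n", + "cumulative = np.cumsum(pca.explained_variance_ratio_)\n", + "n_for_95 = int(np.argmax(cumulative >= 0.95)) + 1 # first count crossing 95%\n", + "print(f\"{n_for_95} components explain 95% of the variance\")" + ], + "metadata": { + "id": "addCumVar" + }, + "execution_count": null, + "outputs": [] + },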
+ { + "cell_type": "markdown", + "source": [ + "## Exercise 4: Introducing Linear Regression\n", + "**Learning Outcome: Learning to use scikit-learn's Linear Regression**\n", + "\n", + "For this Exercise we will be using the Diabetes dataset from sklearn. More information can be found [here](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset).\n" + ], + "metadata": { + "id": "wfV5W-zRX8CA" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "\n", + "\n", + "### Exercise 4a\n", + "The following cell will isolate a single feature from the Diabetes dataset (`feature_index`) and split the dataset into a train/test split with 20 testing samples.\n", + "\n", + "The cell shows how linear regression attempts to draw a straight line that minimises the residual sum of squares between the observed responses in the dataset and the responses predicted by the linear approximation.\n", + "\n", + "Observe the Mean Squared Error (MSE) and the coefficient of determination for different features of the Diabetes dataset." + ], + "metadata": { + "id": "1kWU_zaAd-WH" + } + }, + { + "cell_type": "code", + "source": [ + "def linear_regression(feature_index=2):\n", + " X_diabetes, y_diabetes = sklearn.datasets.load_diabetes(return_X_y=True)\n", + " print(\"Dataset Shape:\", X_diabetes.shape)\n", + " print(\"Target Shape:\", y_diabetes.shape)\n", + " print(\"Targets:\", list(y_diabetes))\n", + " print(f\"There are {len(X_diabetes)} samples in the dataset\")\n", + "\n", + " # we are only using one feature, so isolate that feature\n", + " X_diabetes = X_diabetes[:, np.newaxis, feature_index]\n", + " print(f\"After removing all but feature index {feature_index}, the dataset has the shape {X_diabetes.shape}\")\n", + "\n", + " # split the dataset into training and testing, allowing only 20 samples for testing\n", + " print(\"\\n\" + \"-\"*120) # separator\n", + " NUM_TESTING_SAMPLES = 20\n", + "\n", + " # split input data\n", + " X_train_diabetes = X_diabetes[:-NUM_TESTING_SAMPLES]\n", + " X_test_diabetes = X_diabetes[-NUM_TESTING_SAMPLES:]\n", + "\n", + " # split targets\n", + " y_train_diabetes = y_diabetes[:-NUM_TESTING_SAMPLES]\n", + " y_test_diabetes = y_diabetes[-NUM_TESTING_SAMPLES:]\n", + "\n", + " print(\"Training Input Data Shape:\", X_train_diabetes.shape)\n", + " print(\"Training Target Data Shape:\", y_train_diabetes.shape)\n", + " print(\"Testing Input Data Shape:\", X_test_diabetes.shape)\n", + " print(\"Testing Target Data Shape:\", y_test_diabetes.shape)\n", + "\n", + " # fit the linear regression model to the training data\n", + " print(\"\\n\" + \"-\"*120) # separator\n", + " linreg = LinearRegression()\n", + " linreg.fit(X_train_diabetes, y_train_diabetes)\n", + " y_pred_diabetes = linreg.predict(X_test_diabetes)\n", + "\n", + " print(\"Coefficients:\", linreg.coef_)\n", + " print(\"MSE: %.2f\" % mean_squared_error(y_test_diabetes, y_pred_diabetes))\n", + " print(\"Coefficient of determination: %.2f\" % r2_score(y_test_diabetes, y_pred_diabetes))\n", + " print(\"-\"*120 + \"\\n\")\n", + " # plot\n", + " plt.scatter(X_test_diabetes, y_test_diabetes, color=\"black\") # plots the ground truth labels\n", + " plt.scatter(X_test_diabetes, y_pred_diabetes, color=\"red\") # plots the predicted test labels\n", + " plt.plot(X_test_diabetes, y_pred_diabetes, color=\"blue\", linewidth=3) # plots the predicted line drawn by the linear regression model\n", + " plt.xticks(())\n", + " plt.yticks(())\n", + " plt.title(f\"Exercise 4: Feature Index {feature_index}\")\n", + " plt.show()\n", + "\n", + "linear_regression(feature_index=2)" + ], + "metadata": { + "id": "bom0EcXqY0cF" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "\n", + "\n", + "### Exercise 4b\n", + "\n", + "In the cell below, change the feature_index to 0 and run the cell. Compare the plot produced to that of Exercise 4a. Observe the MSE and coefficient of determination. Do you see any changes?" + ], + "metadata": { + "id": "MxUS_7nCh7xU" + } + }, + { + "cell_type": "code", + "source": [ + "linear_regression(feature_index=2) # change me" + ], + "metadata": { + "id": "l9j1IfGFhbKw" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "Features play an important role in the performance of a machine learning model.\n", + "\n", + "You can always try other regression methods.\n", + "\n", + "\n", + "---\n", + "\n", + "\n" + ], + "metadata": { + "id": "WE7jeQtpj5X4" + } + },
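+ { + "cell_type": "markdown", + "source": [ + "To see how much the choice of features matters, the optional cell below (an illustrative addition, not part of the original lab) fits the same model on all ten diabetes features at once and reports the test MSE and coefficient of determination, for comparison with the single-feature runs above." + ], + "metadata": { + "id": "addAllFeatMd" + } + }, + { + "cell_type": "code", + "source": [ + "# Optional sketch: linear regression on all ten diabetes features at once.\n", + "X_all, y_all = sklearn.datasets.load_diabetes(return_X_y=True)\n", + "\n", + "# keep the same 20-sample test split as the single-feature version\n", + "X_train_all, X_test_all = X_all[:-20], X_all[-20:]\n", + "y_train_all, y_test_all = y_all[:-20], y_all[-20:]\n", + "\n", + "linreg_all = LinearRegression().fit(X_train_all, y_train_all)\n", + "y_pred_all = linreg_all.predict(X_test_all)\n", + "print(\"MSE: %.2f\" % mean_squared_error(y_test_all, y_pred_all))\n", + "print(\"Coefficient of determination: %.2f\" % r2_score(y_test_all, y_pred_all))" + ], + "metadata": { + "id": "addAllFeat" + }, + "execution_count": null, + "outputs": [] + },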
+ { + "cell_type": "markdown", + "source": [ + "## Exercise 5: Introducing Support Vector Machines (SVM)\n", + "**Learning Outcome: Learning to use scikit-learn's SVM**\n", + "\n", + "For this Exercise we will be using the Breast Cancer dataset from sklearn. More information is available [here](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer).\n", + "\n", + "The following cell shows how to use sklearn's SVM classifier (SVC). Refer to the API reference [here](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC) for help with parameters and hyperparameters." + ], + "metadata": { + "id": "WLpZtAdGjzgy" + } + }, + { + "cell_type": "code", + "source": [ + "cancer = sklearn.datasets.load_breast_cancer() # load the breast cancer dataset from sklearn.\n", + "print(f\"Dataset has {len(cancer.feature_names)} features, named: {list(cancer.feature_names)}\")\n", + "print(f\"There are {len(cancer.target_names)} targets, labeled: {list(cancer.target_names)}\")\n", + "print(\"Dataset Shape:\", cancer.data.shape)\n", + "\n", + "print(\"\\n\" + \"-\"*120) # separator\n", + "\n", + "# split the dataset into training and testing sets using sklearn.model_selection.train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.2) # 80/20 train/test split\n", + "print(\"Training Input Data Shape:\", X_train.shape)\n", + "print(\"Training Target Data Shape:\", y_train.shape)\n", + "print(\"Testing Input Data Shape:\", X_test.shape)\n", + "print(\"Testing Target Data Shape:\", y_test.shape)\n", + "\n", + "print(\"\\n\" + \"-\"*120) # separator\n", + "\n", + "model = svm.SVC(kernel='linear') # use linear kernel\n", + "model.fit(X_train, y_train) # train the svm\n", + "y_pred = model.predict(X_test)\n", + "accuracy = accuracy_score(y_test, y_pred) # find accuracy by comparing predicted labels with ground truth labels\n", + "print(f\"Accuracy: {accuracy} ({accuracy*100:.2f}%)\")" + ], + "metadata": { + "id": "zq2K29uhlA6x" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "For you to try:\n", + "1. Evaluate non-linear kernels such as RBF and polynomial on the dataset and compare the accuracy. Use the default hyperparameters.\n", + "2. Do a grid search on the SVM's hyperparameters and find the best ones for this model. Compare the accuracies.\n", + "\n", + "One possible starting point is sketched in the cell below.\n" + ], + "metadata": { + "id": "dN91_Z92n8b0" + } + },
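+ { + "cell_type": "markdown", + "source": [ + "The optional cell below is one possible starting point for the tasks above, not a definitive solution: it compares kernels using default hyperparameters, then runs a small grid search over `C` and `gamma` for the RBF kernel. The grid values are illustrative assumptions, not values prescribed by the lab." + ], + "metadata": { + "id": "addSvmTryMd" + } + }, + { + "cell_type": "code", + "source": [ + "# Optional sketch: compare SVM kernels, then grid-search RBF hyperparameters.\n", + "for kernel in [\"linear\", \"rbf\", \"poly\"]:\n", + " model = svm.SVC(kernel=kernel) # default hyperparameters\n", + " model.fit(X_train, y_train)\n", + " print(f\"{kernel} kernel accuracy: {model.score(X_test, y_test):.3f}\")\n", + "\n", + "# small, illustrative grid over C and gamma for the RBF kernel\n", + "param_grid = {\"C\": [0.1, 1, 10, 100], \"gamma\": [\"scale\", 0.001, 0.0001]}\n", + "search = GridSearchCV(svm.SVC(kernel=\"rbf\"), param_grid, cv=5)\n", + "search.fit(X_train, y_train)\n", + "print(\"Best parameters:\", search.best_params_)\n", + "print(f\"Best model accuracy: {search.score(X_test, y_test):.3f}\")" + ], + "metadata": { + "id": "addSvmTry" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file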