Update Untitled6.ipynb

6e57b0e9 · yl3r22 · f53a09ad · 6e57b0e9
Commit 6e57b0e9 authored Jun 8, 2023 by yl3r22
--- a/Untitled6.ipynb
+++ b/Untitled6.ipynb
@@ -35,11 +35,13 @@
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
+    "# Read the training and testing CSV files into pandas DataFrames.\n",
+    "# header=None tells pandas the files have no header row, so the first line is loaded as data rather than as column names.\n",
    "df = pd.read_csv('C:\\\\Users\\yl3r22\\Downloads\\TrainingDataBinary.csv', header= None)\n",
    "df1 = pd.read_csv('C:\\\\Users\\yl3r22\\Downloads\\TestingDataBinary.csv', header= None)\n",
    "\n",
    "print(df.head(1))\n",
-    "\n",
+    "# Columns 0-127 (the first 128) are stored in df_feature as the features; column index 128 (the 129th and last column) is stored in df_label as the labels.\n",
    "df_feature = df.iloc[:, :128]\n",
    "df_label = df.iloc[:, 128]\n",
    "cancer = datasets.load_breast_cancer()\n",
@@ -93,6 +95,7 @@
    }
   ],
   "source": [
+    "# Predict labels for the testing data (df1) with the fitted linear SVM clf and store them in y_result.\n",
    "y_result = clf.predict(df1)\n",
    "print(y_result)"
   ]
@@ -146,6 +149,7 @@
    }
   ],
   "source": [
+    "# Run 6-fold cross-validation of the random forest clf1 on the training split (X_train, y_train); one score per fold.\n",
    "scores0 = cross_val_score(clf1, X_train, y_train, cv = 6)\n",
    "\n",
    "print(\"Scores:\", scores0)\n",

 %% Cell type:code id:314edca9 tags:

 ``` python
 import pandas as pd
 import numpy as np

 from sklearn import datasets
 from sklearn.model_selection import train_test_split
 from sklearn import svm, metrics
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import accuracy_score
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import cross_val_score

+# Read the training and testing CSV files into pandas DataFrames.
+# header=None tells pandas the files have no header row, so the first line is loaded as data rather than as column names.
 # NOTE(review): '\y', '\D' and '\T' in the paths below are not valid escape sequences; Python keeps the backslash but warns about it. A raw string (r'...') would be cleaner.
 df = pd.read_csv('C:\\Users\yl3r22\Downloads\TrainingDataBinary.csv', header= None)
 df1 = pd.read_csv('C:\\Users\yl3r22\Downloads\TestingDataBinary.csv', header= None)

 # Show the first row as a sanity check; the recorded output reports 129 columns.
 print(df.head(1))
-
+# Columns 0-127 (the first 128) are stored in df_feature as the features; column index 128 (the 129th and last column) is stored in df_label as the labels.
 df_feature = df.iloc[:, :128]
 df_label = df.iloc[:, 128]
 # NOTE(review): 'cancer' is not used by any of the cells shown here.
 cancer = datasets.load_breast_cancer()

 # Split dataset into training set and test set (20% held out for testing).
 # No random_state is passed, so the split differs between runs.
 X_train, X_test, y_train, y_test = train_test_split(df_feature, df_label, test_size=0.2)

 ```

 %% Output

             0            1          2            3           4            5    \
    0  70.399324  127673.0908 -49.572308  127648.0176 -169.578319  127723.2374
    
             6          7          8          9    ...  119  120  121  122  123  \
    0  65.689611  605.91099 -57.003571  626.78553  ...    0    0    0    0    0
    
       124  125  126  127  128
    0    0    0    0    0    0
    
    [1 rows x 129 columns]

 %% Cell type:code id:e614c178 tags:

 ``` python
 # Create a support vector classifier with a linear kernel
 clf = svm.SVC(kernel='linear') # Linear Kernel

 # Fit the classifier on the training split
 clf.fit(X_train, y_train)

 # Predict labels for the held-out split
 y_pred = clf.predict(X_test)

 # NOTE(review): the label says "TrainingAccuracy", but the score is computed
 # on the held-out X_test / y_test split, not on the training data.
 print("TrainingAccuracy:",metrics.accuracy_score(y_test, y_pred))
 ```

 %% Output

    TrainingAccuracy: 0.8775

 %% Cell type:code id:4d4d4c52 tags:

 ``` python
+# Predict labels for the testing data (df1) with the fitted linear SVM clf and store them in y_result.
 y_result = clf.predict(df1)
 print(y_result)
 ```

 %% Output

    [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0
     0 0 1 1 1 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0
     0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

 %% Cell type:code id:f8c31855 tags:

 ``` python
 # Create a random forest classifier (not an SVM)
 clf1 = RandomForestClassifier(n_estimators=100, max_features=70)  # 100 trees, 70 candidate features per split

 # Fit the forest on the training split
 clf1.fit(X_train, y_train)

 # Predict labels for the held-out split (overwrites y_pred from the SVM cell)
 y_pred = clf1.predict(X_test)

 # NOTE(review): the label says "TrainingAccuracy", but the score is computed
 # on the held-out X_test / y_test split, not on the training data.
 print("TrainingAccuracy:",metrics.accuracy_score(y_test, y_pred))
 ```

 %% Output

    TrainingAccuracy: 0.985

 %% Cell type:markdown id:31fc5eaa tags:


 %% Cell type:code id:95729aae tags:

 ``` python
+# Run 6-fold cross-validation of the random forest clf1 on the training split (X_train, y_train); one score per fold.
 scores0 = cross_val_score(clf1, X_train, y_train, cv = 6)

 # Report the per-fold scores and their mean
 print("Scores:", scores0)
 print("MeanScores:", np.mean(scores0))
 ```

 %% Output

    Scores: [0.97625 0.97625 0.98375 0.98125 0.97625 0.97875]
    MeanScores: 0.9787499999999999

 %% Cell type:code id:65a2e2ed tags:

 ``` python
 # Reload the testing data; header=None because the file has no header row.
 # Raw strings keep the Windows backslashes literal, so no invalid escape
 # sequences (such as '\y' or '\D') appear in the path.
 test_data = pd.read_csv(r"C:\Users\yl3r22\Downloads\TestingDataBinary.csv", header=None)

 # Predict a label for every testing row with the random forest fitted earlier (clf1).
 predictions = clf1.predict(test_data)

 # Wrap the predicted labels in a DataFrame so they can be joined column-wise.
 prediction_df = pd.DataFrame(predictions)

 # Append the predictions as one extra column to the right of the testing features.
 result = pd.concat([test_data, prediction_df], axis=1)

 # Write features plus predicted label to a CSV file, without index or header
 # row, matching the header-less layout of the input files.
 result.to_csv(r'C:\Users\yl3r22\Downloads\TestingResultsBinary.csv', index=False, header=False)
 ```

 %% Cell type:code id:1251c20e tags:

 ``` python
 # Print the random forest predictions for the testing data as a visual check
 print(predictions)
 ```

 %% Output

    [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
     1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
     1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

 %% Cell type:code id:f2018309 tags:

 ``` python
 ```