diff --git a/feature extraction/Feature description. v1.1 .xlsx b/feature extraction/Feature description. v1.1 .xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..bf4f6e4825741ccc979e28f5993157b58f38f7fc
Binary files /dev/null and b/feature extraction/Feature description. v1.1 .xlsx differ
diff --git a/feature extraction/DataPreprocessing_FeatureEngineering_Pipeline_df1n19.ipynb b/tfn/notebooks/.ipynb_checkpoints/DataPreprocessing_FeatureEngineering_Pipeline_df1n19 v1.1-checkpoint.ipynb
similarity index 94%
rename from feature extraction/DataPreprocessing_FeatureEngineering_Pipeline_df1n19.ipynb
rename to tfn/notebooks/.ipynb_checkpoints/DataPreprocessing_FeatureEngineering_Pipeline_df1n19 v1.1-checkpoint.ipynb
index 300137e3e2d71b93d22426f7063f3e4ac737d4fa..c62f23391cd5d20b286bcdefd0c6157a13f00407 100644
--- a/feature extraction/DataPreprocessing_FeatureEngineering_Pipeline_df1n19.ipynb	
+++ b/tfn/notebooks/.ipynb_checkpoints/DataPreprocessing_FeatureEngineering_Pipeline_df1n19 v1.1-checkpoint.ipynb	
@@ -1278,7 +1278,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "songs = apply_pipeline(songs, feature_pipeline_song)\n"
+    "songs = apply_pipeline(songs, feature_pipeline_song)"
    ]
   },
   {
@@ -1583,23 +1583,6 @@
    "execution_count": 16,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# Standardize numerical data: registration_duration & song_length\n",
-    "\n",
-    "transfer_list = ['song_length','registration_duration']\n",
-    "def standardize(data, transfer_list):\n",
-    "    for i in transfer_list:\n",
-    "        data[i] = preprocessing.scale(data[i], axis=0, with_mean=True, with_std=True, copy=True)\n",
-    "        \n",
-    "standardize(train, transfer_list)\n",
-    "standardize(test, transfer_list)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [],
    "source": [
     "# Sparse label data from train dataset\n",
     "\n",
@@ -1613,7 +1596,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -1681,7 +1664,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.420898</td>\n",
+       "      <td>2103</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
@@ -1704,7 +1687,7 @@
        "      <td>13</td>\n",
        "      <td>24.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.596334</td>\n",
+       "      <td>2301</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
@@ -1713,7 +1696,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.303863</td>\n",
+       "      <td>225396.0</td>\n",
        "      <td>2006.0</td>\n",
        "      <td>31</td>\n",
        "      <td>186.0</td>\n",
@@ -1727,7 +1710,7 @@
        "      <td>13</td>\n",
        "      <td>24.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.596334</td>\n",
+       "      <td>2301</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
@@ -1750,7 +1733,7 @@
        "      <td>13</td>\n",
        "      <td>24.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.596334</td>\n",
+       "      <td>2301</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
@@ -1759,7 +1742,7 @@
        "      <td>1</td>\n",
        "      <td>7</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.872754</td>\n",
+       "      <td>187802.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>2</td>\n",
        "      <td>18.0</td>\n",
@@ -1773,7 +1756,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.420898</td>\n",
+       "      <td>2103</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
@@ -1782,7 +1765,7 @@
        "      <td>1</td>\n",
        "      <td>7</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.035210</td>\n",
+       "      <td>247803.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>31</td>\n",
        "      <td>13.0</td>\n",
@@ -1796,7 +1779,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.420898</td>\n",
+       "      <td>2103</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
@@ -1805,7 +1788,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.234466</td>\n",
+       "      <td>229982.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>34.0</td>\n",
@@ -1819,7 +1802,7 @@
        "      <td>13</td>\n",
        "      <td>24.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.596334</td>\n",
+       "      <td>2301</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7</th>\n",
@@ -1828,7 +1811,7 @@
        "      <td>1</td>\n",
        "      <td>7</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.973945</td>\n",
+       "      <td>181115.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>2</td>\n",
        "      <td>39.0</td>\n",
@@ -1842,7 +1825,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.420898</td>\n",
+       "      <td>2103</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8</th>\n",
@@ -1851,7 +1834,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.506754</td>\n",
+       "      <td>278964.0</td>\n",
        "      <td>2007.0</td>\n",
        "      <td>58</td>\n",
        "      <td>153.0</td>\n",
@@ -1865,7 +1848,7 @@
        "      <td>15</td>\n",
        "      <td>26.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.603422</td>\n",
+       "      <td>2309</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>9</th>\n",
@@ -1874,7 +1857,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.179968</td>\n",
+       "      <td>257369.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>153.0</td>\n",
@@ -1888,7 +1871,7 @@
        "      <td>15</td>\n",
        "      <td>26.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.603422</td>\n",
+       "      <td>2309</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>10</th>\n",
@@ -1897,7 +1880,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.326002</td>\n",
+       "      <td>223933.0</td>\n",
        "      <td>2014.0</td>\n",
        "      <td>109</td>\n",
        "      <td>49.0</td>\n",
@@ -1911,7 +1894,7 @@
        "      <td>15</td>\n",
        "      <td>26.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.603422</td>\n",
+       "      <td>2309</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>11</th>\n",
@@ -1920,7 +1903,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.315954</td>\n",
+       "      <td>224597.0</td>\n",
        "      <td>2015.0</td>\n",
        "      <td>109</td>\n",
        "      <td>79.0</td>\n",
@@ -1934,7 +1917,7 @@
        "      <td>15</td>\n",
        "      <td>26.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.603422</td>\n",
+       "      <td>2309</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>12</th>\n",
@@ -1943,7 +1926,7 @@
        "      <td>3</td>\n",
        "      <td>20</td>\n",
        "      <td>11</td>\n",
-       "      <td>0.523702</td>\n",
+       "      <td>280084.0</td>\n",
        "      <td>2014.0</td>\n",
        "      <td>109</td>\n",
        "      <td>236.0</td>\n",
@@ -1957,7 +1940,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.746018</td>\n",
+       "      <td>786</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>13</th>\n",
@@ -1980,7 +1963,7 @@
        "      <td>5</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.688482</td>\n",
+       "      <td>2405</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>14</th>\n",
@@ -1989,7 +1972,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.531335</td>\n",
+       "      <td>210364.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>236.0</td>\n",
@@ -2003,7 +1986,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.746018</td>\n",
+       "      <td>786</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>15</th>\n",
@@ -2012,7 +1995,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.073426</td>\n",
+       "      <td>240624.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>58815.0</td>\n",
@@ -2026,7 +2009,7 @@
        "      <td>5</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.688482</td>\n",
+       "      <td>2405</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>16</th>\n",
@@ -2035,7 +2018,7 @@
        "      <td>3</td>\n",
        "      <td>9</td>\n",
        "      <td>4</td>\n",
-       "      <td>0.050146</td>\n",
+       "      <td>248790.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>13.0</td>\n",
@@ -2049,7 +2032,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.746018</td>\n",
+       "      <td>786</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>17</th>\n",
@@ -2058,7 +2041,7 @@
        "      <td>3</td>\n",
        "      <td>9</td>\n",
        "      <td>4</td>\n",
-       "      <td>0.217920</td>\n",
+       "      <td>259877.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>108</td>\n",
        "      <td>308.0</td>\n",
@@ -2072,7 +2055,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.746018</td>\n",
+       "      <td>786</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>18</th>\n",
@@ -2081,7 +2064,7 @@
        "      <td>3</td>\n",
        "      <td>20</td>\n",
        "      <td>11</td>\n",
-       "      <td>0.306687</td>\n",
+       "      <td>265743.0</td>\n",
        "      <td>2013.0</td>\n",
        "      <td>109</td>\n",
        "      <td>236.0</td>\n",
@@ -2095,7 +2078,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.746018</td>\n",
+       "      <td>786</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>19</th>\n",
@@ -2104,7 +2087,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.052982</td>\n",
+       "      <td>241975.0</td>\n",
        "      <td>2006.0</td>\n",
        "      <td>109</td>\n",
        "      <td>252.0</td>\n",
@@ -2118,7 +2101,7 @@
        "      <td>9</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.147940</td>\n",
+       "      <td>1461</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>20</th>\n",
@@ -2127,7 +2110,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.841823</td>\n",
+       "      <td>189846.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>108</td>\n",
        "      <td>10.0</td>\n",
@@ -2141,7 +2124,7 @@
        "      <td>9</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.147940</td>\n",
+       "      <td>1461</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>21</th>\n",
@@ -2150,7 +2133,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.036845</td>\n",
+       "      <td>247911.0</td>\n",
        "      <td>2003.0</td>\n",
        "      <td>109</td>\n",
        "      <td>298.0</td>\n",
@@ -2164,7 +2147,7 @@
        "      <td>9</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.147940</td>\n",
+       "      <td>1461</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>22</th>\n",
@@ -2187,7 +2170,7 @@
        "      <td>9</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.147940</td>\n",
+       "      <td>1461</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>23</th>\n",
@@ -2210,7 +2193,7 @@
        "      <td>15</td>\n",
        "      <td>18.0</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.866519</td>\n",
+       "      <td>650</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>24</th>\n",
@@ -2233,7 +2216,7 @@
        "      <td>15</td>\n",
        "      <td>18.0</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.866519</td>\n",
+       "      <td>650</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>25</th>\n",
@@ -2242,7 +2225,7 @@
        "      <td>3</td>\n",
        "      <td>11</td>\n",
        "      <td>7</td>\n",
-       "      <td>-1.728693</td>\n",
+       "      <td>131239.0</td>\n",
        "      <td>2006.0</td>\n",
        "      <td>58</td>\n",
        "      <td>546.0</td>\n",
@@ -2256,7 +2239,7 @@
        "      <td>15</td>\n",
        "      <td>18.0</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.866519</td>\n",
+       "      <td>650</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>26</th>\n",
@@ -2265,7 +2248,7 @@
        "      <td>3</td>\n",
        "      <td>11</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.934035</td>\n",
+       "      <td>307200.0</td>\n",
        "      <td>1998.0</td>\n",
        "      <td>109</td>\n",
        "      <td>1.0</td>\n",
@@ -2279,7 +2262,7 @@
        "      <td>15</td>\n",
        "      <td>18.0</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.866519</td>\n",
+       "      <td>650</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>27</th>\n",
@@ -2288,7 +2271,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.477796</td>\n",
+       "      <td>213902.0</td>\n",
        "      <td>2009.0</td>\n",
        "      <td>109</td>\n",
        "      <td>231.0</td>\n",
@@ -2302,7 +2285,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.849741</td>\n",
+       "      <td>2587</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>28</th>\n",
@@ -2311,7 +2294,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.390097</td>\n",
+       "      <td>271255.0</td>\n",
        "      <td>2008.0</td>\n",
        "      <td>109</td>\n",
        "      <td>292.0</td>\n",
@@ -2325,7 +2308,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.849741</td>\n",
+       "      <td>2587</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>29</th>\n",
@@ -2334,7 +2317,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.046983</td>\n",
+       "      <td>248581.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>152</td>\n",
        "      <td>62.0</td>\n",
@@ -2348,7 +2331,7 @@
        "      <td>13</td>\n",
        "      <td>34.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.850627</td>\n",
+       "      <td>2588</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>...</th>\n",
@@ -2380,7 +2363,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.453351</td>\n",
+       "      <td>275435.0</td>\n",
        "      <td>1996.0</td>\n",
        "      <td>107</td>\n",
        "      <td>162.0</td>\n",
@@ -2394,7 +2377,7 @@
        "      <td>15</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.570582</td>\n",
+       "      <td>984</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377389</th>\n",
@@ -2403,7 +2386,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.094363</td>\n",
+       "      <td>251712.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>108</td>\n",
        "      <td>24.0</td>\n",
@@ -2417,7 +2400,7 @@
        "      <td>15</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.570582</td>\n",
+       "      <td>984</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377390</th>\n",
@@ -2426,7 +2409,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>4</td>\n",
-       "      <td>0.865757</td>\n",
+       "      <td>302688.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>108</td>\n",
        "      <td>24.0</td>\n",
@@ -2440,7 +2423,7 @@
        "      <td>15</td>\n",
        "      <td>21.0</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.103695</td>\n",
+       "      <td>1745</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377391</th>\n",
@@ -2449,7 +2432,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.347409</td>\n",
+       "      <td>268434.0</td>\n",
        "      <td>2015.0</td>\n",
        "      <td>108</td>\n",
        "      <td>136.0</td>\n",
@@ -2463,7 +2446,7 @@
        "      <td>13</td>\n",
        "      <td>41.0</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.903733</td>\n",
+       "      <td>608</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377392</th>\n",
@@ -2472,7 +2455,7 @@
        "      <td>7</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.701600</td>\n",
+       "      <td>291840.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>103.0</td>\n",
@@ -2486,7 +2469,7 @@
        "      <td>6</td>\n",
        "      <td>23.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>1.417693</td>\n",
+       "      <td>3228</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377393</th>\n",
@@ -2495,7 +2478,7 @@
        "      <td>7</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.053309</td>\n",
+       "      <td>248999.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>103.0</td>\n",
@@ -2509,7 +2492,7 @@
        "      <td>6</td>\n",
        "      <td>23.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>1.417693</td>\n",
+       "      <td>3228</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377394</th>\n",
@@ -2518,7 +2501,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.306587</td>\n",
+       "      <td>225216.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>58815.0</td>\n",
@@ -2532,7 +2515,7 @@
        "      <td>15</td>\n",
        "      <td>22.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.668103</td>\n",
+       "      <td>2382</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377395</th>\n",
@@ -2541,7 +2524,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.351485</td>\n",
+       "      <td>222249.0</td>\n",
        "      <td>2013.0</td>\n",
        "      <td>108</td>\n",
        "      <td>17.0</td>\n",
@@ -2555,7 +2538,7 @@
        "      <td>15</td>\n",
        "      <td>22.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.668103</td>\n",
+       "      <td>2382</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377396</th>\n",
@@ -2564,7 +2547,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.924184</td>\n",
+       "      <td>306549.0</td>\n",
        "      <td>2007.0</td>\n",
        "      <td>109</td>\n",
        "      <td>148.0</td>\n",
@@ -2578,7 +2561,7 @@
        "      <td>15</td>\n",
        "      <td>22.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.668103</td>\n",
+       "      <td>2382</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377397</th>\n",
@@ -2587,7 +2570,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.561434</td>\n",
+       "      <td>208375.0</td>\n",
        "      <td>2003.0</td>\n",
        "      <td>109</td>\n",
        "      <td>252.0</td>\n",
@@ -2601,7 +2584,7 @@
        "      <td>13</td>\n",
        "      <td>18.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.450966</td>\n",
+       "      <td>1119</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377398</th>\n",
@@ -2624,7 +2607,7 @@
        "      <td>15</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>2.547395</td>\n",
+       "      <td>4503</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377399</th>\n",
@@ -2647,7 +2630,7 @@
        "      <td>15</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>2.547395</td>\n",
+       "      <td>4503</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377400</th>\n",
@@ -2656,7 +2639,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>4</td>\n",
-       "      <td>1.000436</td>\n",
+       "      <td>311588.0</td>\n",
        "      <td>2014.0</td>\n",
        "      <td>109</td>\n",
        "      <td>308.0</td>\n",
@@ -2670,7 +2653,7 @@
        "      <td>15</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>2.547395</td>\n",
+       "      <td>4503</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377401</th>\n",
@@ -2679,7 +2662,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.169290</td>\n",
+       "      <td>234289.0</td>\n",
        "      <td>2013.0</td>\n",
        "      <td>109</td>\n",
        "      <td>133.0</td>\n",
@@ -2693,7 +2676,7 @@
        "      <td>22</td>\n",
        "      <td>29.0</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.007060</td>\n",
+       "      <td>1620</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377402</th>\n",
@@ -2702,7 +2685,7 @@
        "      <td>0</td>\n",
        "      <td>20</td>\n",
        "      <td>10</td>\n",
-       "      <td>-1.199389</td>\n",
+       "      <td>166217.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>41</td>\n",
        "      <td>58.0</td>\n",
@@ -2716,7 +2699,7 @@
        "      <td>22</td>\n",
        "      <td>37.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>2.305506</td>\n",
+       "      <td>4230</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377403</th>\n",
@@ -2725,7 +2708,7 @@
        "      <td>0</td>\n",
        "      <td>20</td>\n",
        "      <td>10</td>\n",
-       "      <td>-0.311944</td>\n",
+       "      <td>224862.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>41</td>\n",
        "      <td>1.0</td>\n",
@@ -2739,7 +2722,7 @@
        "      <td>22</td>\n",
        "      <td>37.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>2.305506</td>\n",
+       "      <td>4230</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377404</th>\n",
@@ -2748,7 +2731,7 @@
        "      <td>0</td>\n",
        "      <td>20</td>\n",
        "      <td>10</td>\n",
-       "      <td>-0.914913</td>\n",
+       "      <td>185016.0</td>\n",
        "      <td>2015.0</td>\n",
        "      <td>41</td>\n",
        "      <td>14.0</td>\n",
@@ -2762,7 +2745,7 @@
        "      <td>22</td>\n",
        "      <td>37.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>2.305506</td>\n",
+       "      <td>4230</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377405</th>\n",
@@ -2771,7 +2754,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.619754</td>\n",
+       "      <td>204521.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>96</td>\n",
        "      <td>58815.0</td>\n",
@@ -2785,7 +2768,7 @@
        "      <td>4</td>\n",
        "      <td>28.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.675191</td>\n",
+       "      <td>2390</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377406</th>\n",
@@ -2794,7 +2777,7 @@
        "      <td>1</td>\n",
        "      <td>11</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.718312</td>\n",
+       "      <td>198008.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>128</td>\n",
        "      <td>76.0</td>\n",
@@ -2808,7 +2791,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.240089</td>\n",
+       "      <td>1357</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377407</th>\n",
@@ -2817,7 +2800,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.155414</td>\n",
+       "      <td>235206.0</td>\n",
        "      <td>2015.0</td>\n",
        "      <td>108</td>\n",
        "      <td>31.0</td>\n",
@@ -2831,7 +2814,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.225026</td>\n",
+       "      <td>1374</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377408</th>\n",
@@ -2840,7 +2823,7 @@
        "      <td>7</td>\n",
        "      <td>15</td>\n",
        "      <td>9</td>\n",
-       "      <td>-1.059686</td>\n",
+       "      <td>175449.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>141</td>\n",
        "      <td>58815.0</td>\n",
@@ -2854,7 +2837,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.813414</td>\n",
+       "      <td>2546</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377409</th>\n",
@@ -2877,7 +2860,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.813414</td>\n",
+       "      <td>2546</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377410</th>\n",
@@ -2886,7 +2869,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-1.086380</td>\n",
+       "      <td>173685.0</td>\n",
        "      <td>2003.0</td>\n",
        "      <td>109</td>\n",
        "      <td>291.0</td>\n",
@@ -2900,7 +2883,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>-1.200557</td>\n",
+       "      <td>273</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377411</th>\n",
@@ -2909,7 +2892,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.469367</td>\n",
+       "      <td>214459.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>109</td>\n",
        "      <td>291.0</td>\n",
@@ -2923,7 +2906,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>-1.200557</td>\n",
+       "      <td>273</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377412</th>\n",
@@ -2932,7 +2915,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.237462</td>\n",
+       "      <td>229784.0</td>\n",
        "      <td>2012.0</td>\n",
        "      <td>109</td>\n",
        "      <td>291.0</td>\n",
@@ -2946,7 +2929,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>-1.200557</td>\n",
+       "      <td>273</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377413</th>\n",
@@ -2969,7 +2952,7 @@
        "      <td>4</td>\n",
        "      <td>26.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>2.694478</td>\n",
+       "      <td>4669</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377414</th>\n",
@@ -2978,7 +2961,7 @@
        "      <td>7</td>\n",
        "      <td>15</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.273477</td>\n",
+       "      <td>227404.0</td>\n",
        "      <td>2015.0</td>\n",
        "      <td>109</td>\n",
        "      <td>1.0</td>\n",
@@ -2992,7 +2975,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.058450</td>\n",
+       "      <td>1562</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377415</th>\n",
@@ -3001,7 +2984,7 @@
        "      <td>7</td>\n",
        "      <td>15</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.194026</td>\n",
+       "      <td>258298.0</td>\n",
        "      <td>2015.0</td>\n",
        "      <td>109</td>\n",
        "      <td>9.0</td>\n",
@@ -3015,7 +2998,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.058450</td>\n",
+       "      <td>1562</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377416</th>\n",
@@ -3024,7 +3007,7 @@
        "      <td>0</td>\n",
        "      <td>5</td>\n",
        "      <td>7</td>\n",
-       "      <td>4.216968</td>\n",
+       "      <td>524146.0</td>\n",
        "      <td>2007.0</td>\n",
        "      <td>73</td>\n",
        "      <td>38.0</td>\n",
@@ -3038,7 +3021,7 @@
        "      <td>5</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.602479</td>\n",
+       "      <td>948</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377417</th>\n",
@@ -3047,7 +3030,7 @@
        "      <td>0</td>\n",
        "      <td>5</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.129380</td>\n",
+       "      <td>254026.0</td>\n",
        "      <td>1999.0</td>\n",
        "      <td>72</td>\n",
        "      <td>3.0</td>\n",
@@ -3061,7 +3044,7 @@
        "      <td>5</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.602479</td>\n",
+       "      <td>948</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -3085,15 +3068,15 @@
        "         song_length  song_year  first_genre_type  artist_count  \\\n",
        "0                NaN        NaN               152           NaN   \n",
        "1                NaN        NaN               152           NaN   \n",
-       "2          -0.303863     2006.0                31         186.0   \n",
+       "2           225396.0     2006.0                31         186.0   \n",
        "3                NaN        NaN               152           NaN   \n",
-       "4          -0.872754     2016.0                 2          18.0   \n",
+       "4           187802.0     2016.0                 2          18.0   \n",
        "...              ...        ...               ...           ...   \n",
        "7377413          NaN        NaN               152           NaN   \n",
-       "7377414    -0.273477     2015.0               109           1.0   \n",
-       "7377415     0.194026     2015.0               109           9.0   \n",
-       "7377416     4.216968     2007.0                73          38.0   \n",
-       "7377417     0.129380     1999.0                72           3.0   \n",
+       "7377414     227404.0     2015.0               109           1.0   \n",
+       "7377415     258298.0     2015.0               109           9.0   \n",
+       "7377416     524146.0     2007.0                73          38.0   \n",
+       "7377417     254026.0     1999.0                72           3.0   \n",
        "\n",
        "         composer_count  lyricist_count  first_genre_typecount  featured_song  \\\n",
        "0                   NaN             NaN                    NaN            NaN   \n",
@@ -3122,22 +3105,22 @@
        "7377417       1.0       0.0            52     5   NaN               9   \n",
        "\n",
        "         registration_duration  \n",
-       "0                     0.420898  \n",
-       "1                     0.596334  \n",
-       "2                     0.596334  \n",
-       "3                     0.596334  \n",
-       "4                     0.420898  \n",
+       "0                         2103  \n",
+       "1                         2301  \n",
+       "2                         2301  \n",
+       "3                         2301  \n",
+       "4                         2103  \n",
        "...                        ...  \n",
-       "7377413               2.694478  \n",
-       "7377414              -0.058450  \n",
-       "7377415              -0.058450  \n",
-       "7377416              -0.602479  \n",
-       "7377417              -0.602479  \n",
+       "7377413                   4669  \n",
+       "7377414                   1562  \n",
+       "7377415                   1562  \n",
+       "7377416                    948  \n",
+       "7377417                    948  \n",
        "\n",
        "[7377418 rows x 20 columns]"
       ]
      },
-     "execution_count": 23,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3148,7 +3131,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3169,7 +3152,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
@@ -3178,18 +3161,18 @@
        "1"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 25,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "save(train, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\train_processed.pkl')"
+    "save(data_train, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\data_train.pkl')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
@@ -3198,32 +3181,61 @@
        "1"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "save(test, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\test_processed.pkl')"
+    "save(data_val, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\data_val.pkl')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 27,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "train_pkl = load(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\train.pkl')"
+    "save(label_train, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\label_train.pkl')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 28,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "test_pkl = load(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\test.pkl')"
+    "save(label_val, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\label_val.pkl')"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/tfn/notebooks/.ipynb_checkpoints/DataPreprocessing_FeatureEngineering_Pipeline_df1n19_SVM-checkpoint.ipynb b/tfn/notebooks/.ipynb_checkpoints/DataPreprocessing_FeatureEngineering_Pipeline_df1n19_SVM-checkpoint.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..229205f064ce571ee9fbcbd120ad15ce5efb053a
--- /dev/null
+++ b/tfn/notebooks/.ipynb_checkpoints/DataPreprocessing_FeatureEngineering_Pipeline_df1n19_SVM-checkpoint.ipynb
@@ -0,0 +1,509 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Debug record\n",
+    "1. def user_age(members): add else x at the end\n",
+    "2. def add_is_same_feature(songs): correct 'artist_name'\n",
+    "3. def songs_language_to_str(language):elif type(language) == float and not language.isnull():  # add  not language.isnull()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import necessary packages\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import datetime\n",
+    "import time\n",
+    "import re \n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn import preprocessing\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Loading Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# author@Fiona\n",
+    "# read data from csv files\n",
+    "members = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\members.csv',parse_dates=['registration_init_time','expiration_date'])\n",
+    "songs = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\songs.csv')\n",
+    "songs_extra = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\song_extra_info.csv')\n",
+    "train = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\train.csv')\n",
+    "test = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\test.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Outlier preprocess for members.expiration_date row 16867\n",
+    "\n",
+    "members.expiration_date[16867] = members.registration_init_time[16867]\n",
+    "members.expiration_date[16867]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Data Preprocessing + Feature Extraction\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# song test file generate\n",
+    "songs0 = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\songs.csv')\n",
+    "songs_extra0 = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\song_extra_info.csv')\n",
+    "\n",
+    "songs = songs0.head(1048575)\n",
+    "songs_extra = songs_extra0.head(1048575)\n",
+    "del songs0\n",
+    "del songs_extra0\n",
+    "\n",
+    "# merge songinfo file\n",
+    "songs = songs.merge(songs_extra, on = 'song_id', how = 'left')\n",
+    "del songs_extra\n",
+    "\n",
+    "songs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# author@Fiona\n",
+    "# song feature extraction functions \n",
+    "\n",
+    "# get song_year feature from isrc code\n",
+    "def isrc_to_year(isrc):\n",
+    "    if type(isrc) == str:\n",
+    "        if int(isrc[5:7]) > 17:\n",
+    "            return 1900 + int(isrc[5:7])\n",
+    "        else:\n",
+    "            return 2000 + int(isrc[5:7])\n",
+    "    else:\n",
+    "        return np.nan\n",
+    "\n",
+    "# add new feature \n",
+    "def add_song_year(songs):\n",
+    "    songs['song_year'] = songs['isrc'].apply(lambda attribute: isrc_to_year(attribute))\n",
+    "    return songs\n",
+    "    \n",
+    "# add genre id counts feature 'popular genres'\n",
+    "def add_first_genre_type(songs):\n",
+    "    songs['first_genre_type'] = songs.genre_ids.apply(str).apply(lambda attribute: attribute.split('|')[0])\n",
+    "    return songs\n",
+    "\n",
+    "# get song played counts\n",
+    "# Loop to do the counting and store the results in the new dictionary variable song_played_counts \n",
+    "# Then join song_played_counts with the original table on song_id\n",
+    "#dictionary used to save times a song is played\n",
+    "#key = song_id, value = number of times the song's played\n",
+    "\n",
+    "def song_played_counts(songs):\n",
+    "    song_played_counts_dic = {}\n",
+    "    for song_id in songs['song_id']:\n",
+    "        if song_id not in song_played_counts_dic:\n",
+    "            song_played_counts_dic[song_id] = 1\n",
+    "        else:\n",
+    "            song_played_counts_dic[song_id] += 1\n",
+    "    \n",
+    "    return song_played_counts_dic\n",
+    "\n",
+    "# add song played counts feature 'hit songs'\n",
+    "def add_song_played_times(songs):\n",
+    "    song_played_times = song_played_counts(songs)\n",
+    "    new_song_played_times = pd.DataFrame(pd.Series(song_played_times), columns = ['song_played_times'])\n",
+    "    new_song_played_times = new_song_played_times.reset_index().rename(columns = {'index' : 'song_id'})\n",
+    "    songs = songs.merge(new_song_played_times, on = 'song_id', how = 'left')\n",
+    "    return songs\n",
+    "    \n",
+    "# add artist counts feature 'hot artist'\n",
+    "def add_artist_counts(songs):\n",
+    "    artistcount = songs.groupby(['artist_name'],as_index=False)['artist_name'].agg({'artist_count':'count'})\n",
+    "    songs = songs.merge(artistcount, on = 'artist_name', how = 'left')\n",
+    "    return songs\n",
+    "\n",
+    "# add composer played counts\n",
+    "## Count the number of rows in songs that share each composer value\n",
+    "def add_composer_counts(songs):\n",
+    "    composercount = songs.groupby(['composer'],as_index=False)['composer'].agg({'composer_count':'count'})\n",
+    "    songs = songs.merge(composercount, on = 'composer', how = 'left')\n",
+    "    return songs\n",
+    "\n",
+    "\n",
+    "#add lyricist played counts\n",
+    "# Count the number of rows in songs that share each lyricist value\n",
+    "def add_lyricist_counts(songs):\n",
+    "    lyricistcount = songs.groupby(['lyricist'],as_index=False)['lyricist'].agg({'lyricist_count':'count'})\n",
+    "    songs = songs.merge(lyricistcount, on = 'lyricist', how = 'left')\n",
+    "    return songs\n",
+    "\n",
+    "# add genre type counts 'popular genres'\n",
+    "# Grouping must use the first_genre_type attribute\n",
+    "def add_genere_counts(songs):\n",
+    "    genrecount = songs.groupby(['first_genre_type'],as_index = False)['first_genre_type'].agg({'first_genre_typecount':'count'})\n",
+    "    songs = songs.merge(genrecount, on = 'first_genre_type', how = 'left')\n",
+    "    return songs\n",
+    "\n",
+    "\n",
+    "# add feat feature \n",
+    "def add_feat_feature(songs):\n",
+    "    songs['featured_song'] = songs.apply(lambda attribute : 1 if 'feat' in str(attribute['artist_name'])else 0,\n",
+    "                                        axis = 1)\n",
+    "    return songs\n",
+    "    \n",
+    "\n",
+    "# add if_artist_composer_lyricist_are_same feature:\n",
+    "def add_is_same_feature(songs):\n",
+    "    songs['same_c_l'] = songs.apply(lambda attribute : 1 if attribute['composer'] == attribute['lyricist'] else 0, \n",
+    "                                    axis = 1 )\n",
+    "    songs['all_same'] = songs.apply(lambda attribute : 1 if attribute['artist_name'] == attribute['composer'] and \n",
+    "                                   attribute['composer'] == attribute['lyricist'] else 0, axis = 1)\n",
+    "    return songs\n",
+    "\n",
+    "\n",
+    "#deal with missing values\n",
+    "def songs_language_to_str(language):\n",
+    "    if language == -1.0 or np.isnan(language):\n",
+    "        return np.nan\n",
+    "    elif type(language) == float:\n",
+    "        return str(int(language)) \n",
+    "    else:\n",
+    "        return np.nan\n",
+    "    \n",
+    "def missing_value(songs):\n",
+    "    #deal with song_length missing value\n",
+    "    songs.song_length.fillna(songs.song_length.mean(), inplace = True)\n",
+    "    # deal with language missing value\n",
+    "    songs['song_language'] = songs['language'].apply(songs_language_to_str)\n",
+    "    return songs\n",
+    "\n",
+    "    \n",
+    "#delete unuseful features\n",
+    "def del_unuseful_feature(songs):\n",
+    "    songs = songs.drop('isrc', axis = 1)\n",
+    "    songs = songs.drop('name', axis = 1)\n",
+    "    songs = songs.drop('genre_ids', axis = 1)\n",
+    "    songs = songs.drop(['artist_name', 'composer', 'lyricist'], axis = 1)\n",
+    "    songs = songs.drop('language', axis = 1)\n",
+    "    return songs\n",
+    "    \n",
+    "# finish song features extraction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# author@Karen\n",
+    "# member features extraction\n",
+    "# Convert datetime values to timestamps for calculation\n",
+    "def add_registration_duration(members):\n",
+    "    members.registration_init_time = members.registration_init_time.apply(lambda x : time.mktime(x.timetuple()))\n",
+    "    members.expiration_date = members.expiration_date.apply(lambda x : time.mktime(x.timetuple()))\n",
+    "    #count Duration\n",
+    "    members['registration_duration'] = ((members['expiration_date'] - members['registration_init_time'])/(24*60*60))\n",
+    "    members['registration_duration'] = members['registration_duration'].apply(int)\n",
+    "    return members\n",
+    "\n",
+    "#print (duration(members))\n",
+    "\n",
+    "# Replace bd == 0 and ages outside 12-65 with NaN\n",
+    "def user_age(members):\n",
+    "    members['bd'] = members['bd'].replace(0,np.nan)\n",
+    "    members['bd'] = members['bd'].apply(lambda x: np.nan if x < 12 or x > 65 else x) # else x\n",
+    "    return members\n",
+    "\n",
+    "#print (age(members))\n",
+    "\n",
+    "#drop unnecessary features\n",
+    "def delete_unuseful_features(members):\n",
+    "    members = members.drop(\"registration_init_time\",axis = 1)\n",
+    "    members = members.drop(\"expiration_date\",axis = 1)\n",
+    "    members = members.drop(\"gender\",axis = 1) \n",
+    "    return members\n",
+    "\n",
+    "# finish member features extraction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#train and test feature extraction\n",
+    "\n",
+    "#feature_pipeline_song\n",
+    "feature_pipeline_song = [add_song_year, add_first_genre_type, add_artist_counts, add_composer_counts, \n",
+    "                         add_lyricist_counts, add_genere_counts, add_feat_feature, add_is_same_feature,\n",
+    "                         missing_value, del_unuseful_feature]\n",
+    "\n",
+    "# feature_pipeline_members \n",
+    "feature_pipeline_members = [add_registration_duration, user_age, delete_unuseful_features]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "songs.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#apply pipeline functions\n",
+    "def apply_pipeline(data, pipelinefunctions):\n",
+    "    for function in pipelinefunctions:\n",
+    "        data = function(data)\n",
+    "    return data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "songs = apply_pipeline(songs, feature_pipeline_song)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "songs.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "members = apply_pipeline(members, feature_pipeline_members)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "members.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# user operation feature extraction functions\n",
+    "train = train.merge(songs, on = 'song_id', how = 'left')\n",
+    "train = train.merge(members, on = 'msno', how = 'left')\n",
+    "\n",
+    "test = test.merge(songs, on = 'song_id', how = 'left')\n",
+    "test = test.merge(members, on = 'msno', how = 'left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#labelencoding operation features\n",
+    "le = preprocessing.LabelEncoder()\n",
+    "transfer = ['msno', 'song_id', 'first_genre_type', 'source_system_tab', 'source_screen_name', 'source_type']\n",
+    "\n",
+    "def labelencoding(data, transfer):\n",
+    "    for i in transfer:\n",
+    "        data[i] = np.array(le.fit_transform(data[i].tolist()))\n",
+    "\n",
+    "\n",
+    "labelencoding(train, transfer)\n",
+    "labelencoding(test, transfer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Standardize numerical data: registration_duration & song_length\n",
+    "\n",
+    "transfer_list = ['song_length','registration_duration']\n",
+    "def standardize(data, transfer_list):\n",
+    "    for i in transfer_list:\n",
+    "        data[i] = preprocessing.scale(data[i], axis=0, with_mean=True, with_std=True, copy=True)\n",
+    "        \n",
+    "standardize(train, transfer_list)\n",
+    "standardize(test, transfer_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Separate label data from train dataset\n",
+    "\n",
+    "label = train.target.tolist()\n",
+    "train = train.drop('target', axis = 1)\n",
+    "\n",
+    "# training and validation dataset split 70:30 & shuffle\n",
+    "\n",
+    "data_train, data_val, label_train, label_val = train_test_split(train, label, test_size=0.3, shuffle=True )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# save and load the data using pickle (Optional)\n",
+    "import pickle\n",
+    "\n",
+    "def save(obj, filename):\n",
+    "    with open(filename, 'wb') as output:  # Overwrites any existing file.\n",
+    "        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)\n",
+    "    return 1\n",
+    "\n",
+    "##-- Load obj from file    \n",
+    "def load(filename):\n",
+    "    with open(filename, 'rb') as input: \n",
+    "        obj = pickle.load(input)\n",
+    "    return obj   "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "save(train, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\train_processed.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "save(test, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\test_processed.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_pkl = load(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\train.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_pkl = load(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\test.pkl')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tfn/notebooks/.ipynb_checkpoints/EDA_continued-checkpoint.ipynb b/tfn/notebooks/.ipynb_checkpoints/EDA_continued-checkpoint.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..3d0129935bfa1f75628eb9c93e58c4e34a2b180a
--- /dev/null
+++ b/tfn/notebooks/.ipynb_checkpoints/EDA_continued-checkpoint.ipynb
@@ -0,0 +1,2061 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Member Data Exploration"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## STEPS:\n",
+    "1. Transform datetime to string\n",
+    "2. Calculate the amount of 'NaN'\n",
+    "3. Data summarization -- > distinct data\n",
+    "4. bd -- > find out outliers\n",
+    "5. Distribution of categorical data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Train Data Exploration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train = pd.read_csv('D:/Project/DS/Data Mining/cw1/kkbox_recommendation/data/train.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>msno</th>\n",
+       "      <th>song_id</th>\n",
+       "      <th>source_system_tab</th>\n",
+       "      <th>source_screen_name</th>\n",
+       "      <th>source_type</th>\n",
+       "      <th>target</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
+       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
+       "      <td>explore</td>\n",
+       "      <td>Explore</td>\n",
+       "      <td>online-playlist</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
+       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
+       "      <td>my library</td>\n",
+       "      <td>Local playlist more</td>\n",
+       "      <td>local-playlist</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
+       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
+       "      <td>my library</td>\n",
+       "      <td>Local playlist more</td>\n",
+       "      <td>local-playlist</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
+       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
+       "      <td>my library</td>\n",
+       "      <td>Local playlist more</td>\n",
+       "      <td>local-playlist</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
+       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
+       "      <td>explore</td>\n",
+       "      <td>Explore</td>\n",
+       "      <td>online-playlist</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                           msno  \\\n",
+       "0  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
+       "1  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
+       "2  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
+       "3  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
+       "4  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
+       "\n",
+       "                                        song_id source_system_tab  \\\n",
+       "0  BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=           explore   \n",
+       "1  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=        my library   \n",
+       "2  JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=        my library   \n",
+       "3  2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=        my library   \n",
+       "4  3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=           explore   \n",
+       "\n",
+       "    source_screen_name      source_type  target  \n",
+       "0              Explore  online-playlist       1  \n",
+       "1  Local playlist more   local-playlist       1  \n",
+       "2  Local playlist more   local-playlist       1  \n",
+       "3  Local playlist more   local-playlist       1  \n",
+       "4              Explore  online-playlist       1  "
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "msno                  7377418\n",
+       "song_id               7377418\n",
+       "source_system_tab     7352569\n",
+       "source_screen_name    6962614\n",
+       "source_type           7355879\n",
+       "target                7377418\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def summarize(data): \n",
+    "    summary = dict()    \n",
+    "\n",
+    "    total_row, total_column = data.shape\n",
+    "    for column in data.columns:\n",
+    "        summary[column] = data[column].nunique() # len(set(data[column]))--> missing data is also counted\n",
+    "    \n",
+    "    print('Total number of records:{}'.format(total_row))\n",
+    "    for key, value in summary.items():\n",
+    "        print('The distinct {} in records:{}'.format(key, value))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def count_nan(data):\n",
+    "    total_row, total_column = data.shape\n",
+    "    total_nan = [(0, 0) for _ in range(total_column)]\n",
+    "                 \n",
+    "    df = pd.DataFrame(total_nan, columns = ['Nan_count', 'Percentage'], index = data.columns)\n",
+    "   # result.astype({'Percentage':float})\n",
+    "\n",
+    "    for column in data.columns:\n",
+    "        number_nan = data[column].isnull().sum()\n",
+    "        df.loc[column][0] = number_nan\n",
+    "        df.loc[column][1] = number_nan/total_row * 100\n",
+    "    convert_dict = {\n",
+    "        'Nan_count': int,\n",
+    "        'Percentage': float\n",
+    "    }\n",
+    "\n",
+    "    df = df.astype(convert_dict)\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1.1 Get the distinct count number in train data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['explore', 'my library', 'search', 'discover', nan, 'radio',\n",
+       "       'listen with', 'notification', 'settings'], dtype=object)"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train.source_system_tab.unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>target</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>7.377418e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>5.035171e-01</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>4.999877e-01</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>0.000000e+00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>0.000000e+00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>1.000000e+00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>1.000000e+00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>1.000000e+00</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             target\n",
+       "count  7.377418e+06\n",
+       "mean   5.035171e-01\n",
+       "std    4.999877e-01\n",
+       "min    0.000000e+00\n",
+       "25%    0.000000e+00\n",
+       "50%    1.000000e+00\n",
+       "75%    1.000000e+00\n",
+       "max    1.000000e+00"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.007342677343211405\n"
+     ]
+    }
+   ],
+   "source": [
+    "n = len(train.source_screen_name)\n",
+    "count = 0\n",
+    "for test in train.source_screen_name:\n",
+    "    if test == 'Unknown':\n",
+    "        count += 1\n",
+    "print(count/n)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total number of records:7377418\n",
+      "The distinct msno in records:30755\n",
+      "The distinct song_id in records:359966\n",
+      "The distinct source_system_tab in records:8\n",
+      "The distinct source_screen_name in records:20\n",
+      "The distinct source_type in records:12\n",
+      "The distinct target in records:2\n"
+     ]
+    }
+   ],
+   "source": [
+    "summarize(train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1.2 Get the total number of missing data in train data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Nan_count</th>\n",
+       "      <th>Percentage</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>msno</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>song_id</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>source_system_tab</th>\n",
+       "      <td>24849</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>source_screen_name</th>\n",
+       "      <td>414804</td>\n",
+       "      <td>5.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>source_type</th>\n",
+       "      <td>21539</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>target</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                    Nan_count  Percentage\n",
+       "msno                        0         0.0\n",
+       "song_id                     0         0.0\n",
+       "source_system_tab       24849         0.0\n",
+       "source_screen_name     414804         5.0\n",
+       "source_type             21539         0.0\n",
+       "target                      0         0.0"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "count_nan(train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "a = train.groupby('source_type').size()\n",
+    "a = pd.DataFrame(a)\n",
+    "a.reset_index(level=0, inplace=True)\n",
+    "a.columns = ['Source System Tab', 'Count']\n",
+    "a = a.sort_values(by='Count', ascending=False)\n",
+    "print(a)\n",
+    "plt.rcParams[\"axes.labelsize\"] = 15\n",
+    "ax = sns.catplot(x='Source System Tab', y='Count', kind='bar',\n",
+    "                 data=a, height=6, palette='ch:2.5,-.2,dark=.4', aspect=1.5)\n",
+    "ax.fig.suptitle('Distribution of Source System Tab', fontsize=15)\n",
+    "ax.fig.subplots_adjust(top=.9)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Source System Tab    Count\n",
+      "3        my library  3684730\n",
+      "0          discover  2179252\n",
+      "6            search   623286\n",
+      "5             radio   476701\n",
+      "2       listen with   212266\n",
+      "1           explore   167949\n",
+      "4      notification     6185\n",
+      "7          settings     2200\n"
+     ]
+    },
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 648x432 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "a = train.groupby('source_system_tab').size()\n",
+    "a = pd.DataFrame(a)\n",
+    "a.reset_index(level=0, inplace=True)\n",
+    "a.columns = ['Source System Tab', 'Count']\n",
+    "a = a.sort_values(by='Count', ascending=False)\n",
+    "print(a)\n",
+    "plt.rcParams[\"axes.labelsize\"] = 15\n",
+    "ax = sns.catplot(x='Source System Tab', y='Count', kind='bar',\n",
+    "                 data=a, height=6, palette='ch:2.5,-.2,dark=.4', aspect=1.5)\n",
+    "ax.fig.suptitle('Distribution of Source System Tab', fontsize=15)\n",
+    "ax.fig.subplots_adjust(top=.9)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                     target\n",
+      "source_system_tab          \n",
+      "my library         0.619659\n",
+      "settings           0.590909\n",
+      "explore            0.422146\n",
+      "search             0.421362\n",
+      "discover           0.415770\n",
+      "notification       0.378011\n",
+      "listen with        0.326581\n",
+      "radio              0.222662\n"
+     ]
+    },
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 648x432 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "repeat_df = train.groupby(['source_system_tab']).mean(\n",
+    ").sort_values(by='target', ascending=False)\n",
+    "print(repeat_df)\n",
+    "\n",
+    "repeat_df.reset_index(level=0, inplace=True)\n",
+    "repeat_df.columns = ['Source System Tab', 'Target']\n",
+    "plt.rcParams[\"axes.labelsize\"] = 15\n",
+    "ax = sns.catplot(x='Source System Tab', y='Target', kind='bar', palette=\"ch:7,-.2,dark=.4\",\n",
+    "                 data=repeat_df, height=6, aspect=1.5)\n",
+    "ax.fig.subplots_adjust(top=.9)\n",
+    "ax.fig.suptitle('Source System Tab v/s Target', fontsize=15)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 756x504 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "Type = train.groupby(['song_id']).sum().sort_values(\n",
+    "    by='target', ascending=False)\n",
+    "Type.reset_index(level=0, inplace=True)\n",
+    "Type.columns = ['Source Type', 'Views/user']\n",
+    "\n",
+    "new = Type.groupby(['Views/user']).count()\n",
+    "new.reset_index(inplace=True)\n",
+    "new.columns = ['Views/user', 'Log_Count']\n",
+    "new['Log_Count'] = np.log(new.Log_Count)\n",
+    "plt.rcParams[\"axes.labelsize\"] = 15\n",
+    "ax = sns.catplot(x='Views/user', y='Log_Count', kind='strip', color='blue', height=7, aspect=1.5,\n",
+    "                 data=new)\n",
+    "ax.fig.subplots_adjust(top=.9)\n",
+    "ax.fig.suptitle('Distribution of Views', fontsize=15)\n",
+    "\n",
+    "for a in ax.axes.flat:\n",
+    "    labels = a.get_xticklabels() # get x labels\n",
+    "    for i,l in enumerate(labels):\n",
+    "        if(i%100 != 0): labels[i] = '' # blank every label except each 100th one\n",
+    "    ax.set_xticklabels(labels, rotation = 0) # set new lab\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Member Data Exploration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "member = pd.read_csv('members.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>msno</th>\n",
+       "      <th>city</th>\n",
+       "      <th>bd</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>registered_via</th>\n",
+       "      <th>registration_init_time</th>\n",
+       "      <th>expiration_date</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NA</td>\n",
+       "      <td>7</td>\n",
+       "      <td>2011-08-20</td>\n",
+       "      <td>2017-09-20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NA</td>\n",
+       "      <td>7</td>\n",
+       "      <td>2015-06-28</td>\n",
+       "      <td>2017-06-22</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NA</td>\n",
+       "      <td>4</td>\n",
+       "      <td>2016-04-11</td>\n",
+       "      <td>2017-07-12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NA</td>\n",
+       "      <td>9</td>\n",
+       "      <td>2015-09-06</td>\n",
+       "      <td>2015-09-07</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NA</td>\n",
+       "      <td>4</td>\n",
+       "      <td>2017-01-26</td>\n",
+       "      <td>2017-06-13</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                           msno  city  bd gender  \\\n",
+       "0  XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=     1   0     NA   \n",
+       "1  UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=     1   0     NA   \n",
+       "2  D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=     1   0     NA   \n",
+       "3  mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=     1   0     NA   \n",
+       "4  q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=     1   0     NA   \n",
+       "\n",
+       "   registered_via registration_init_time expiration_date  \n",
+       "0               7             2011-08-20      2017-09-20  \n",
+       "1               7             2015-06-28      2017-06-22  \n",
+       "2               4             2016-04-11      2017-07-12  \n",
+       "3               9             2015-09-06      2015-09-07  \n",
+       "4               4             2017-01-26      2017-06-13  "
+      ]
+     },
+     "execution_count": 85,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "member.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "songs = pd.read_csv('songs.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>song_id</th>\n",
+       "      <th>song_length</th>\n",
+       "      <th>genre_ids</th>\n",
+       "      <th>artist_name</th>\n",
+       "      <th>composer</th>\n",
+       "      <th>lyricist</th>\n",
+       "      <th>language</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=</td>\n",
+       "      <td>247640</td>\n",
+       "      <td>465</td>\n",
+       "      <td>張信哲 (Jeff Chang)</td>\n",
+       "      <td>董貞</td>\n",
+       "      <td>何啟弘</td>\n",
+       "      <td>3.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=</td>\n",
+       "      <td>197328</td>\n",
+       "      <td>444</td>\n",
+       "      <td>BLACKPINK</td>\n",
+       "      <td>TEDDY|  FUTURE BOUNCE|  Bekuh BOOM</td>\n",
+       "      <td>TEDDY</td>\n",
+       "      <td>31.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=</td>\n",
+       "      <td>231781</td>\n",
+       "      <td>465</td>\n",
+       "      <td>SUPER JUNIOR</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>31.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=</td>\n",
+       "      <td>273554</td>\n",
+       "      <td>465</td>\n",
+       "      <td>S.H.E</td>\n",
+       "      <td>湯小康</td>\n",
+       "      <td>徐世珍</td>\n",
+       "      <td>3.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=</td>\n",
+       "      <td>140329</td>\n",
+       "      <td>726</td>\n",
+       "      <td>貴族精選</td>\n",
+       "      <td>Traditional</td>\n",
+       "      <td>Traditional</td>\n",
+       "      <td>52.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                        song_id  song_length genre_ids  \\\n",
+       "0  CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=       247640       465   \n",
+       "1  o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=       197328       444   \n",
+       "2  DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=       231781       465   \n",
+       "3  dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=       273554       465   \n",
+       "4  W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=       140329       726   \n",
+       "\n",
+       "        artist_name                            composer     lyricist  language  \n",
+       "0  張信哲 (Jeff Chang)                                  董貞          何啟弘       3.0  \n",
+       "1         BLACKPINK  TEDDY|  FUTURE BOUNCE|  Bekuh BOOM        TEDDY      31.0  \n",
+       "2      SUPER JUNIOR                                 NaN          NaN      31.0  \n",
+       "3             S.H.E                                 湯小康          徐世珍       3.0  \n",
+       "4              貴族精選                         Traditional  Traditional      52.0  "
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "songs.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>msno</th>\n",
+       "      <th>city</th>\n",
+       "      <th>bd</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>registered_via</th>\n",
+       "      <th>registration_init_time</th>\n",
+       "      <th>expiration_date</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>7</td>\n",
+       "      <td>20110820</td>\n",
+       "      <td>20170920</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>7</td>\n",
+       "      <td>20150628</td>\n",
+       "      <td>20170622</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4</td>\n",
+       "      <td>20160411</td>\n",
+       "      <td>20170712</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>9</td>\n",
+       "      <td>20150906</td>\n",
+       "      <td>20150907</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4</td>\n",
+       "      <td>20170126</td>\n",
+       "      <td>20170613</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                           msno  city  bd gender  \\\n",
+       "0  XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=     1   0    NaN   \n",
+       "1  UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=     1   0    NaN   \n",
+       "2  D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=     1   0    NaN   \n",
+       "3  mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=     1   0    NaN   \n",
+       "4  q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=     1   0    NaN   \n",
+       "\n",
+       "   registered_via  registration_init_time  expiration_date  \n",
+       "0               7                20110820         20170920  \n",
+       "1               7                20150628         20170622  \n",
+       "2               4                20160411         20170712  \n",
+       "3               9                20150906         20150907  \n",
+       "4               4                20170126         20170613  "
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "member.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "member.registration_init_time = pd.to_datetime(member.registration_init_time.astype('str'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "member.expiration_date = pd.to_datetime(member.expiration_date.astype('str'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>city</th>\n",
+       "      <th>bd</th>\n",
+       "      <th>registered_via</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>34403.000000</td>\n",
+       "      <td>34403.000000</td>\n",
+       "      <td>34403.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>5.371276</td>\n",
+       "      <td>12.280935</td>\n",
+       "      <td>5.953376</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>6.243929</td>\n",
+       "      <td>18.170251</td>\n",
+       "      <td>2.287534</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>-43.000000</td>\n",
+       "      <td>3.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>4.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>7.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>10.000000</td>\n",
+       "      <td>25.000000</td>\n",
+       "      <td>9.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>22.000000</td>\n",
+       "      <td>1051.000000</td>\n",
+       "      <td>16.000000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               city            bd  registered_via\n",
+       "count  34403.000000  34403.000000    34403.000000\n",
+       "mean       5.371276     12.280935        5.953376\n",
+       "std        6.243929     18.170251        2.287534\n",
+       "min        1.000000    -43.000000        3.000000\n",
+       "25%        1.000000      0.000000        4.000000\n",
+       "50%        1.000000      0.000000        7.000000\n",
+       "75%       10.000000     25.000000        9.000000\n",
+       "max       22.000000   1051.000000       16.000000"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "member.describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2.1 Get the number of distinct values in member data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total number of records:34403\n",
+      "The distinct msno in records:34403\n",
+      "The distinct city in records:21\n",
+      "The distinct bd in records:95\n",
+      "The distinct gender in records:2\n",
+      "The distinct registered_via in records:6\n",
+      "The distinct registration_init_time in records:3862\n",
+      "The distinct expiration_date in records:1484\n"
+     ]
+    }
+   ],
+   "source": [
+    "summarize(member)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2.2 Get the total number of missing data in member data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Nan_count</th>\n",
+       "      <th>Percentage</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>msno</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>city</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>bd</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>gender</th>\n",
+       "      <td>19902</td>\n",
+       "      <td>57.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>registered_via</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>registration_init_time</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>expiration_date</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                        Nan_count  Percentage\n",
+       "msno                            0         0.0\n",
+       "city                            0         0.0\n",
+       "bd                              0         0.0\n",
+       "gender                      19902        57.0\n",
+       "registered_via                  0         0.0\n",
+       "registration_init_time          0         0.0\n",
+       "expiration_date                 0         0.0"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "count_nan(member)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2.3 The distribution of categorical variables"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### The distribution of age"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'index' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[1;32m<ipython-input-35-c17665332d8f>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mindex\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m: name 'index' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bd = member.groupby(['bd']).size()\n",
+    "bd = pd.DataFrame(bd)\n",
+    "bd.reset_index(level=0, inplace=True)\n",
+    "bd.columns = ['Age', 'Count']\n",
+    "\n",
+    "plt.rcParams[\"axes.labelsize\"] = 15\n",
+    "ax = sns.catplot(x='Age', y='Count', kind='bar', palette=\"ch:2,0.3,dark=.4\",\n",
+    "                 data=bd, height=6, aspect=1.5)\n",
+    "#ax.fig.subplots_adjust(top=.9)\n",
+    "ax.fig.suptitle('Distribution of member age', fontsize=15)\n",
+    "\n",
+    "for a in ax.axes.flat:\n",
+    "    labels = a.get_xticklabels() # get x labels\n",
+    "    for i,l in enumerate(labels):\n",
+    "        if(i%2 != 0): labels[i] = '' # blank every odd-indexed label (keep every 2nd)\n",
+    "    ax.set_xticklabels(labels, rotation = 0) # set new lab\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bd.describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### The distribution of age (0 < age <= 80)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "bd = member.groupby(['bd']).size()\n",
+    "bd = pd.DataFrame(bd)\n",
+    "bd.reset_index(level=0, inplace=True)\n",
+    "bd.columns = ['Age', 'Count']\n",
+    "bd = bd[(bd['Age'] > 0) & (bd['Age'] <= 80)  ]\n",
+    "\n",
+    "plt.rcParams[\"axes.labelsize\"] = 15\n",
+    "ax = sns.catplot(x='Age', y='Count', kind='bar', palette=\"ch:2,0.3,dark=.4\",\n",
+    "                 data=bd, height=6, aspect=1.5)\n",
+    "#ax.fig.subplots_adjust(top=.9)\n",
+    "ax.fig.suptitle('Distribution of member age', fontsize=15)\n",
+    "for a in ax.axes.flat:\n",
+    "    labels = a.get_xticklabels() # get x labels\n",
+    "    for i,l in enumerate(labels):\n",
+    "        if(i%2 != 0): labels[i] = '' # blank every odd-indexed tick label (keep every other one)\n",
+    "    ax.set_xticklabels(labels, rotation = 0) # apply the thinned-out labels\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bd.describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Distribution of City IDs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAoAAAAGoCAYAAADW2lTlAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3dfbhldV338fcnRswnEnAgZDDQa7TMuxAnpFRSSRjIGKk0uLtlSronTSrTStTuwKeiQi2vDEMZgVKQRGM0nkZS7AGQAZEHwRiU5Mg0M0o+FIWB3/uP/Tu1nTnnzBxmr332zHq/rmtfe+3fWvv3/a2ZM2s+Z6312ztVhSRJkvrjuxZ6AJIkSRovA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknlm00AOYBMuXL6/LLrtsoYchSZI0apmp0TOAwFe+8pWFHoIkSdLYGAAlSZJ6xgAoSZLUMwZASZKknhlrAExyQJJPJLktya1Jfr2175VkbZI72vOerT1J3plkfZKbkhwy1NfKtv0dSVYOtT8jyc3tPe9MMuPNj5IkSX017jOADwCvqaofAA4DXpnkqcApwJVVtRS4sr0GOBpY2h6rgDNhEBiBU4FnAocCp06HxrbNqqH3LR/DfkmSJO00xhoAq2pDVd3Qlr8J3AbsD6wAzm2bnQu8qC2vAM6rgWuAxybZDzgKWFtV91bVvwJrgeVt3R5VdXVVFXDeUF+SJEliAe8BTHIg8HTgWmDfqtoAg5AI7NM22x+4e+htU61trvapGdpnqr8qybok6zZv3ryjuyNJkrTTWJAAmOTRwEXAq6rqG3NtOkNbPYT2rRurzqqqZVW1bPHixdsasiRJ0i5j7AEwycMYhL/3V9WHW/PGdvmW9ryptU8BBwy9fQlwzzbal8zQLkmSpGbcs4ADnA3cVlVvH1q1BpieybsSuHio/cQ2G/gw4OvtEvHlwJFJ9myTP44ELm/rvpnksFbrxKG+JEmSxPi/C/hZwEuBm5Pc2NpeD5wOXJjkJOBLwIvbukuAY4D1wH3ALwJU1b1J3gxc17Z7U1Xd25ZfAZwDPAK4tD0kSZLUZDBZtt+WLVtW69atW+hhSJIkjdqMn4fsN4FIkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6plxzwKeeLff8fcj7/P7lz575H1KkiQ9VJ4BlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPTPWAJhkdZJNSW4Zavtgkhvb464kN7b2A5P8x9C6dw+95xlJbk6yPsk7k6S175VkbZI72vOe49w/SZKkncG4zwCeAywfbqiqn6uqg6vqYOAi4MNDq++cXldVLx9qPxNYBSxtj+k+TwGurKqlwJXttSRJkoaMNQBW1aeAe2da187ivQQ4f64+kuwH7FFVV1dVAecBL2qrVwDntuVzh9olSZLUTNI9gM8BNlbVHUNtByX5TJKrkjynte0PTA1tM9XaAPatqg0A7Xmf2YolWZVkXZJ1mzdvHt1eSJIkTbhJCoAn8J1n/zYAT6iqpwOvBj6QZA8gM7y35lusqs6qqmVVtWzx4sUPacCSJEk7o0ULPQCAJIuAnwaeMd1WVfcD97fl65PcCTyZwRm/JUNvXwLc05Y3Jtmvqja0S8WbxjF+SZKkncmknAH8CeD2qvrvS7tJFifZrS0/kcFkjy+0S7vfTHJYu2/wRODi9rY1wMq2vHKoXZIkSc24PwbmfOBq4ClJppK
c1FYdz9aTPw4HbkryWeBDwMuranoCySuA9wLrgTuBS1v76cALktwBvKC9liRJ0pCxXgKuqhNmaf+FGdouYvCxMDNtvw542gztXwWO2LFRSpIk7dom5RKwJEmSxsQAKEmS1DMGQEmSpJ4xAEqSJPWMAVCSJKlnDICSJEk9YwCUJEnqGQOgJElSzxgAJUmSesYAKEmS1DMGQEmSpJ4xAEqSJPWMAVCSJKlnDICSJEk9YwCUJEnqGQOgJElSzxgAJUmSesYAKEmS1DMGQEmSpJ4xAEqSJPWMAVCSJKlnDICSJEk9YwCUJEnqGQOgJElSzxgAJUmSesYAKEmS1DMGQEmSpJ4xAEqSJPWMAVCSJKlnDICSJEk9YwCUJEnqGQOgJElSzxgAJUmSesYAKEmS1DMGQEmSpJ4xAEqSJPXMWANgktVJNiW5ZajttCRfTnJjexwztO51SdYn+XySo4bal7e29UlOGWo/KMm1Se5I8sEku49v7yRJknYO4z4DeA6wfIb2d1TVwe1xCUCSpwLHAz/Y3vNnSXZLshvwLuBo4KnACW1bgD9ofS0F/hU4qdO9kSRJ2gmNNQBW1aeAe7dz8xXABVV1f1V9EVgPHNoe66vqC1X1LeACYEWSAM8HPtTefy7wopHugCRJ0i5gUu4BPDnJTe0S8Z6tbX/g7qFtplrbbO17A1+rqge2aJ9RklVJ1iVZt3nz5lHthyRJ0sSbhAB4JvAk4GBgA/C21p4Ztq2H0D6jqjqrqpZV1bLFixfPb8SSJEk7sUULPYCq2ji9nOQ9wMfayynggKFNlwD3tOWZ2r8CPDbJonYWcHh7SZIkNQt+BjDJfkMvjwOmZwivAY5P8vAkBwFLgU8D1wFL24zf3RlMFFlTVQV8AvjZ9v6VwMXj2AdJkqSdyVjPACY5H3gu8LgkU8CpwHOTHMzgcu1dwC8DVNWtSS4EPgc8ALyyqh5s/ZwMXA7sBqyuqltbidcCFyR5C/AZ4Owx7ZokSdJOI4MTZ/22bNmyWrduHQC33/H3I+//+5c+e+R9SpIkbYeZ5kgs/CVgSZIkjZcBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPjDUAJlmdZFOSW4ba/ijJ7UluSvKRJI9t7Qcm+Y8kN7bHu4fe84wkNydZn+SdSdLa90qyNskd7XnPce6fJEnSzmDcZwDPAZZv0bYWeFpV/RDwT8DrhtbdWVUHt8fLh9rPBFYBS9tjus9TgCurailwZXstSZKkIWMNgFX1KeDeLdquqKoH2strgCVz9ZFkP2CPqrq6qgo4D3hRW70COLctnzvULkmSpGbS7gF8GXDp0OuDknwmyVVJntPa9gemhraZam0A+1bVBoD2vM9shZKsSrIuybrNmzePbg8kSZIm3MQEwCRvAB4A3t+aNgBPqKqnA68GPpBkDyAzvL3mW6+qzqqqZVW1bPHixQ912JIkSTudRQs9AIAkK4EXAke0y7pU1f3A/W35+iR3Ak9mcMZv+DLxEuCetrwxyX5VtaFdKt40rn2QJEnaWSz4GcAky4HXAsdW1X1D7YuT7NaWn8hgsscX2qXdbyY5rM3+PRG4uL1tDbCyLa8capckSVIz1jOASc4Hngs8LskUcCqDWb8PB9a2T3O5ps34PRx4U5IHgAeBl1fV9ASSVzCYUfwIBvcMTt83eDpwYZKTgC8BLx7DbkmSJO1UxhoAq+qEGZrPnmXbi4CLZlm3DnjaDO1
fBY7YkTFKkiTt6hb8ErAkSZLGywAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPbHcATHJikr1nWbdXkhNHNyxJkiR1ZT5nAN8HPGmWdQe19ZIkSZpw8wmAmWPd3sA3dnAskiRJGoNFc61MsgJYMdT0/5Js3mKz7waeA1w34rFJkiSpA3MGQGAf4H8NvX4S8L1bbPMt4ArgLSMclyRJkjoyZwCsqvcA7wFI8gngFVV1+zgGJkmSpG5s6wzgf6uq53U5EEmSJI3HdgdAgCSPB14ILGFw79+wqqrXjmpgkiRJ6sZ2B8AkxwHnA7sBmxjc+zesAAOgJEnShJvPGcDfYzDZ4xeq6t6OxiNJkqSOzScAHgD8quFPkiRp5zafD4L+R+ApXQ1EkiRJ4zGfM4CvBt6f5N+AtcDXttygqu4b1cAkSZLUjfkEwJva8/sYTPiYyW47NhxJkiR1bT4B8GXMHvwkSZK0k5jPB0Gf0+E4JEmSNCbzmQQiSZKkXcB8Pgh6M9u4BFxV++zwiCRJktSp+dwD+C62DoB7Ac8H9gDOHtWgJEmS1J353AN42kztSQJcCDwwojFJkiSpQzt8D2BVFfBe4OQdH44kSZK6NqpJIE8Edt+eDZOsTrIpyS1DbXslWZvkjva8Z2tPkncmWZ/kpiSHDL1nZdv+jiQrh9qfkeTm9p53tjOUkiRJauYzCeRXZmjeHfgB4OeBv9rOrs4B/hQ4b6jtFODKqjo9ySnt9WuBo4Gl7fFM4EzgmUn2Ak4FljG4L/H6JGuq6l/bNquAa4BLgOXApdu7n5IkSbu6+UwC+dMZ2u4HpoA/A964PZ1U1aeSHLhF8wrguW35XOCTDALgCuC8dpn5miSPTbJf23ZtVd0LkGQtsDzJJ4E9qurq1n4e8CIMgJIkSf9tPpNAuvzMwH2rakOrsyHJ9MfJ7A/cPbTdVGubq31qhvatJFnF4EwhT3jCE0awC5IkSTuHSf8g6Jnu36uH0L51Y9VZVbWsqpYtXrx4B4YoSZK0c5lXAEzyxCRntkkWX27Pf5bkiTs4jo3t0i7teVNrnwIOGNpuCXDPNtqXzNAuSZKkZrsDYJJnADcCPwNcx2ASx3Xt9WeGZ+g+BGuA6Zm8K4GLh9pPbLOBDwO+3i4VXw4cmWTPNmP4SODytu6bSQ5rs39PHOpLkiRJzG8SyBnAZ4Cjq+q+6cYkj2Qw2/YMBt8KMqck5zOYxPG4JFMMZvOeDlyY5CTgS8CL2+aXAMcA64H7gF8EqKp7k7yZQQAFeNP0hBDgFQxmGj+CweQPJ4BIkiQNmU8APBR4yXD4A6iq+5KcAXxwezqpqhNmWXXEDNsW8MpZ+lkNrJ6hfR3wtO0ZiyRJUh/N5x7A/wD2nmXdXsB/7vhwJEmS1LX5BMC/AU5P8uzhxvb694GPjnJgkiRJ6sZ8LgG/msGEiquSbAY2AvsA+wL/ALxm9MOTJEnSqM3ng6C/Cjw7yXLgR4D9gA3AtVV1RUfjkyRJ0ojNeQk4yd5JLkpy1HRbVV1WVW+uql+pqjcPNstFQ9/eIUmSpAm2rXsAXwU8EZjrDN8VwEF4CViSJGmnsK0A+BLg3e3jWGbU1v05sGKUA5MkSVI3thUAvw/43Hb0cxtw4A6PRpIkSZ3bVgD8D2CP7ejn0W1bSZIkTbhtBcAbgGO3o58VbVtJkiRNuG0FwHcBJyVZOdsGSU5k8B29fzrKgUmSJKkbc34OYFV9OMmfAO9LcjJwGfAloIAnAEcBy4B3VNVHuh6sJEmSdtw2Pwi6ql6T5JMMPhLmN4GHt1X3M/gGkBVV9bHORihJkqSR2q5vAqmqjwIfTbII2Ls1f7WqHuhsZJIkSerEfL4LmBb4NnY0FkmSJI3BtiaBSJIkaRdjAJQkSeoZA6AkSVLPGAAlSZJ6xgA
oSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMxMRAJM8JcmNQ49vJHlVktOSfHmo/Zih97wuyfokn09y1FD78ta2PskpC7NHkiRJk2vRQg8AoKo+DxwMkGQ34MvAR4BfBN5RVWcMb5/kqcDxwA8Cjwc+nuTJbfW7gBcAU8B1SdZU1efGsiOSJEk7gYkIgFs4Arizqv45yWzbrAAuqKr7gS8mWQ8c2tatr6ovACS5oG1rAJQkSWom4hLwFo4Hzh96fXKSm5KsTrJna9sfuHtom6nWNlu7JEmSmokKgEl2B44F/qo1nQk8icHl4Q3A26Y3neHtNUf7TLVWJVmXZN3mzZt3aNySJEk7k4kKgMDRwA1VtRGgqjZW1YNV9W3gPfzPZd4p4ICh9y0B7pmjfStVdVZVLauqZYsXLx7xbkiSJE2uSQuAJzB0+TfJfkPrjgNuactrgOOTPDzJQcBS4NPAdcDSJAe1s4nHt20lSZLUTMwkkCSPZDB795eHmv8wycEMLuPeNb2uqm5NciGDyR0PAK+sqgdbPycDlwO7Aaur6tax7YQkSdJOYGICYFXdB+y9RdtL59j+rcBbZ2i/BLhk5AOUJEnaRUzaJWBJkiR1zAAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcmKgAmuSvJzUluTLKute2VZG2SO9rznq09Sd6ZZH2Sm5IcMtTPyrb9HUlWLtT+SJIkTaKJCoDN86rq4Kpa1l6fAlxZVUuBK9trgKOBpe2xCjgTBoEROBV4JnAocOp0aJQkSdJkBsAtrQDObcvnAi8aaj+vBq4BHptkP+AoYG1V3VtV/wqsBZaPe9CSJEmTatICYAFXJLk+yarWtm9VbQBoz/u09v2Bu4feO9XaZmv/DklWJVmXZN3mzZtHvBuSJEmTa9FCD2ALz6qqe5LsA6xNcvsc22aGtpqj/Tsbqs4CzgJYtmzZVuslSZJ2VRN1BrCq7mnPm4CPMLiHb2O7tEt73tQ2nwIOGHr7EuCeOdolSZLEBAXAJI9K8pjpZeBI4BZgDTA9k3clcHFbXgOc2GYDHwZ8vV0ivhw4MsmebfLHka1NkiRJTNYl4H2BjySBwbg+UFWXJbkOuDDJScCXgBe37S8BjgHWA/cBvwhQVfcmeTNwXdvuTVV17/h2Q5IkabJNTACsqi8APzxD+1eBI2ZoL+CVs/S1Glg96jFKkiTtCibmErAkSZLGwwAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPLFroAUiStt+V572tk36POPE1nfQraTJ5BlCSJKlnDICSJEk9YwCUJEnqGQOgJElSzxgAJUmSesYAKEmS1DMGQEmSpJ4xAEqSJPWMAVCSJKlnDICSJEk9YwCUJEnqGQOgJElSzyxa6AEAJDkAOA/4XuDbwFlV9SdJTgP+L7C5bfr6qrqkved1wEnAg8CvVdXlrX058CfAbsB7q+r0ce6
L4IZ1a0be5yHLjh15n5Ik9dVEBEDgAeA1VXVDkscA1ydZ29a9o6rOGN44yVOB44EfBB4PfDzJk9vqdwEvAKaA65KsqarPjWUvJEmSdgITEQCragOwoS1/M8ltwP5zvGUFcEFV3Q98Mcl64NC2bn1VfQEgyQVtWwOgJElSM3H3ACY5EHg6cG1rOjnJTUlWJ9mzte0P3D30tqnWNlv7THVWJVmXZN3mzZtn2kSSJGmXNFEBMMmjgYuAV1XVN4AzgScBBzM4Q/i26U1neHvN0b51Y9VZVbWsqpYtXrx4h8cuSZK0s5iIS8AASR7GIPy9v6o+DFBVG4fWvwf4WHs5BRww9PYlwD1tebZ2SZIkMSFnAJMEOBu4rarePtS+39BmxwG3tOU1wPFJHp7kIGAp8GngOmBpkoOS7M5gosjop6RKkiTtxCblDOCzgJcCNye5sbW9HjghycEMLuPeBfwyQFXdmuRCBpM7HgBeWVUPAiQ5GbicwcfArK6qW8e5I5IkSZNuIgJgVf09M9+/d8kc73kr8NYZ2i+Z632SJEl9NxGXgCVJkjQ+BkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknpmIj4IWt377I2XjrzPHz746JH3KUmSuucZQEmSpJ4xAEqSJPWMAVCSJKlnvAdQkjQRLn/374+8z6Ne/rqR9yntCgyAkiRJC+D9v3VKJ/3+/B+dvs1tvAQsSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUM34TiHZq1/3jhzrp90d+7Gc76VeSpEngGUBJkqSeMQBKkiT1jAFQkiSpZ7wHUJI0qyvO/sOR93nkSb898j4lzY9nACVJknrGAChJktQzBkBJkqSe8R5ASdpBnzj/jzvp93knvKqTfiXJM4CSJEk9YwCUJEnqGS8BSxPo6o+f20m/P/oTK7dq+4dLz+6k1rOOPqmTfrfXVRf9aSf9/vjPnNxJv5I0TgZASZJ2ERefflon/a44pZt+tXB2yQCYZDnwJ8BuwHur6vQFHtKMbvnc3468z6c99fkj71Malb9b8+5O+n3OsS/vpF9J/fPB17++k35/7vd+r5N+H6pdLgAm2Q14F/ACYAq4Lsmaqvrcwo5MktRHH3v7m0be5wtf/bsj71P9sssFQOBQYH1VfQEgyQXACsAAqB1y7afO76TfZx5+Qif9SpI0m10xAO4P3D30egp45gKNRZI0gS5551tG3ucxv/Y7I+9zkn3kLd2chTzud7Y+Y/qh3+3mz/Zn3zT6n4OdRapqoccwUkleDBxVVb/UXr8UOLSqfnWL7VYBq9rLpwCfn2epxwFf2cHhTmKtcdezlrWsZa1JrjXuetay1qhrfaWqlm/ZuCueAZwCDhh6vQS4Z8uNquos4KyHWiTJuqpa9lDfP6m1xl3PWtaylrUmuda461nLWuOqtSt+EPR1wNIkByXZHTgeWLPAY5IkSZoYu9wZwKp6IMnJwOUMPgZmdVXdusDDkiRJmhi7XAAEqKpLgEs6LvOQLx9PeK1x17OWtaxlrUmuNe561rLWWGrtcpNAJEmSNLdd8R5ASZIkzcEAOE9JVifZlOSWMdT67iSfTvLZJLcmeeMYau6W5DNJPtZxnbuS3JzkxiTrOq712CQfSnJ7ktuS/GhHdZ7S9mf68Y0kr+qiVqv3G+3n4pYk5yf57g5r/Xqrc2sX+zTTv6skeyVZm+SO9rxnh7Ve3Pbt20lGNstullp/1H4Wb0rykSSP7bDWm1udG5NckeTxXdUaWvebSSrJ47qqleS0JF8e+rd2TFe1WvuvJvl8+xn5w65qJfng0D7dleTGUdSao97BSa6ZPg4nObTDWj+c5Op23P9okj1GUOeAJJ9ox/Vbk/x6ax/5sWOOWiM/dsxRa7THjqryMY8HcDhwCHDLGGoFeHRbfhhwLXBYxzVfDXwA+FjHde4CHjemv7N
zgV9qy7sDjx1Dzd2AfwG+r6P+9we+CDyivb4Q+IWOaj0NuAV4JIP7hj8OLB1xja3+XQF/CJzSlk8B/qDDWj/A4PNAPwks63i/jgQWteU/6Hi/9hha/jXg3V3Vau0HMJiA98+j+vc9y36dBvzmKH8G56j1vPYz//D2ep8u/wyH1r8N+N2O9+0K4Oi2fAzwyQ5rXQf8eFt+GfDmEdTZDzikLT8G+CfgqV0cO+aoNfJjxxy1Rnrs8AzgPFXVp4B7x1Srqurf2suHtUdnN20mWQL8JPDermqMW/st83DgbICq+lZVfW0MpY8A7qyqf+6wxiLgEUkWMQhnW33e5Yj8AHBNVd1XVQ8AVwHHjbLALP+uVjAI77TnF3VVq6puq6r5fhj8Q611RftzBLiGwWeVdlXrG0MvH8WIjh9zHAffAfz2qOpso9bIzVLrFcDpVXV/22ZTh7UASBLgJcDIvn9ylnoFTJ+J+x5GdAyZpdZTgE+15bXAz4ygzoaquqEtfxO4jcEvxyM/dsxWq4tjxxy1RnrsMABOuAwuyd4IbALWVtW1HZb7YwYH7293WGNaAVckuT6Db2XpyhOBzcD7Mri0/d4kj+qw3rTjGeHBe0tV9WXgDOBLwAbg61V1RUflbgEOT7J3kkcyOFNwwDbeMwr7VtUGGBwQgX3GUHPcXgZc2mWBJG9Ncjfw80A33901qHMs8OWq+mxXNbZwcrsUtnpUtwfM4snAc5Jcm+SqJD/SYa1pzwE2VtUdHdd5FfBH7efjDOB1Hda6BTi2Lb+YER9DkhwIPJ3BlbJOjx1b1OrUHLV2+NhhAJxwVfVgVR3MIOkfmuRpXdRJ8kJgU1Vd30X/M3hWVR0CHA28MsnhHdVZxOBSxJlV9XTg3xlcEuhMBh9AfizwVx3W2JPBb7kHAY8HHpXk/3RRq6puY3C5YS1wGfBZ4IE536RtSvIGBn+O7++yTlW9oaoOaHVO7qJG+8XgDXQYMLdwJvAk4GAGvwC9rcNai4A9gcOA3wIubGfounQCHf4COeQVwG+0n4/foF0p6cjLGBzrr2dwWfNbo+o4yaOBi4BXbXHWe+Qmodaojh0GwJ1Eu2z5SWCr7/MbkWcBxya5C7gAeH6Sv+yoFlV1T3veBHwEGMnNx3lxr6YAAAXxSURBVDOYAqaGzpx+iEEg7NLRwA1VtbHDGj8BfLGqNlfVfwEfBn6sq2JVdXZVHVJVhzO4tNP1mQmAjUn2A2jPI7n0NgmSrAReCPx8tRt6xuADjOCy2yyexOCXkc+2Y8gS4IYk39tFsara2H45/jbwHro7fsDgGPLhdkvOpxlcIRnJBJeZtFs6fhr4YFc1hqxkcOyAwS+snf05VtXtVXVkVT2DQbi9cxT9JnkYg5D0/qqa3pdOjh2z1OrEbLVGeewwAE6wJIunZ/kkeQSD//Rv76JWVb2uqpZU1YEMLl/+bVV1ckYpyaOSPGZ6mcGNrZ3Mqq6qfwHuTvKU1nQE8Lkuag0Zx2/vXwIOS/LIdjbiCAb3iXQiyT7t+QkM/nMax9mJNQz+g6I9XzyGmp1Lshx4LXBsVd3Xca2lQy+Ppbvjx81VtU9VHdiOIVMMbmL/ly7qTf/n3hxHR8eP5q+B57e6T2YwkewrHdb7CeD2qprqsMa0e4Afb8vPp8Nf7IaOId8F/A7w7hH0GQZnLW+rqrcPrRr5sWOOWiM3W62RHzt2ZAZJHx8M/uPbAPwXg4PcSR3W+iHgM8BNDA5wI5sRto26z6XDWcAM7sv7bHvcCryh4/05GFjX/hz/Gtizw1qPBL4KfM8Y/p7eyOA/9FuAv6DNUuyo1t8xCM6fBY7ooP+t/l0BewNXMvhP6Upgrw5rHdeW7wc2Apd3WGs9cDdwY3uMambuTLUuaj8fNwEfZXAjeSe1tlh/F6ObBTzTfv0FcHPbrzXAfh3W2h34y/bneAPw/C7/DIFzgJePosZ27Nuzgevbv+trgWd0WOvXGcxm/SfgdNoXUexgnWczuJ/8pqF
/T8d0ceyYo9bIjx1z1BrpscNvApEkSeoZLwFLkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJM0jy00n+NsnXktyf5J+SvCXJ45IcmKTaVyhOb//bSZ7bwThOS/KVodfTtacf/57kziTvT/KcUdeXtGsyAErSFpK8jcFXY30BeCmDb6t5B/BTDL56bAPwo8DfD73ttxl8iPq4/GYbwzHAmxl8+O2nkpw6xjFI2kktWugBSNIkSfJTwKsZfDPD6qFVVyU5Cziyqu4HrlmQAf6Pz1fV9BiuAs5J8ibgtCRXVdUnF25okiadZwAl6Tv9BnDDFuEPgKp6sKou3fIScJK7GJyBO3Xo0uxzk/xVkk9s2U+SNybZ2L7wfZTeyOD7XV8+4n4l7WIMgJLUtED2Y8Bl83zrccDXGXyB+4+2xw3Ae4EfT3LQUI0AJwJ/WVX/NYpxT6uqB4G/BQ4bZb+Sdj0GQEn6H3sDDwe+NJ83VdVngAeAqaq6pj2+Aaxl8OXtvzC0+fOAA4H3jWLAM5gC9u2ob0m7CAOgJG2tRtJJ1beBc4AT25k/GITBdVV1yyhqzCDb3kRS3xkAJel/fBW4H3jCCPt8H/B9wPOSPAb4GWCr+wtHaH9gY4f9S9oFGAAlqWn35P0DcNQI+7wL+DiDM38vYXDcPX9U/Q9Lsgh4PnB1F/1L2nUYACXpO/0xsCzJyi1XJPmuJMtned+3gO+eZd3ZDM78/Qrw11X1tZGMdGu/CzweeHdH/UvaRfg5gJI0pKo+muTtwNlJngVcDPwb8P0MPl7lLgYfFbOl24GfTHJZ2/7zVfXNtu6vgT8DDgFeN6KhPqV9Q8juwEHA8cBy4LSqumpENSTtogyAkrSFqnpNkn8ETgY+ADyCQfBbA5zBzGf6fgt4F/A3wCMZzPb9ZOvv/iSXAoczuBw8Cme05/9k8M0kVwOHV9Xfjah/SbuwVI1kspskaRbt3rx/BlZX1f9b6PFIkmcAJakjSXYHfhj43ww+Y/DPF3ZEkjRgAJSk7jwe+DSwCfjlqpoaXpnku5hjMl5VPdDt8CT1lZeAJWmBJDkH2Gq28ZCD2sfISNJIGQAlaYEkORB43Byb3FRV3xrPaCT1iQFQkiSpZ/wgaEmSpJ4xAEqSJPWMAVCSJKlnDICSJEk98/8B0tdGgwLgw4cAAAAASUVORK5CYII=\n",
+      "text/plain": [
+       "<Figure size 648x432 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "city = member.groupby(['city']).size()\n",
+    "city = pd.DataFrame(city)\n",
+    "city.reset_index(level=0, inplace=True)\n",
+    "city.columns = ['City_ID', 'Count']\n",
+    "\n",
+    "plt.rcParams[\"axes.labelsize\"] = 15\n",
+    "ax = sns.catplot(x='City_ID', y='Count', kind='bar', palette=\"ch:3,0.6,dark=.5\",\n",
+    "                 data=city, height=6, aspect=1.5)  # 3. Train Data Exploration\n",
+    "\n",
+    "train = pd.read_csv('train.csv')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Distribution of Gender"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 648x432 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "member['gender'] = member.gender.replace(np.NaN, 'NA')\n",
+    "plt.rcParams[\"axes.labelsize\"] = 15\n",
+    "ax = sns.catplot(x='gender', kind='count', palette=\"ch:17,0.6,dark=.5\",\n",
+    "                 data=member, height=6, aspect=1.5)\n",
+    "ax.fig.subplots_adjust(top=.9)\n",
+    "ax.fig.suptitle('Distribution of Gender', fontsize=15)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Songs Data Exploration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "songs = pd.read_csv('songs.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "summarize(songs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "count_nan(songs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "songs.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for n in songs.genre_ids.values:\n",
+    "    if len(str(n)) > 5:\n",
+    "        print(n)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAoAAAAGqCAYAAACbEvXuAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3de7gkVX3u8e8bJly8IBcHLwwKyMRoNFEckURPNKIwqGEwRz0kJoBy5IiSeI1gTISoyfGOMUGUCAKJCmg0okFxghCPRoHxwk00jIIygjAwgCgKQX/nj1pbm57ee/Ye9r2+n+fpZ3etWlW1VndN9ztVtapTVUiSJKk/fmWuGyBJkqTZZQCUJEnqGQOgJElSzxgAJUmSesYAKEmS1DMGQEmSpJ4xAGrRSHJskmqPnye5OclFSf4myQOH6u7a6j1rkuvesq3/MVNoz9VJ3j4wfUqSNZPv0YTr3jfJy0eUT9s2plOSFyW5KsldSc4fp85T2nvyqFluXm8kuXeS05Pc1F7rQ8epd3iSA0eU322flrRwLZnrBkjT7FZgZXt+P2BP4Ajg8CQrq+orbd51wG8D35zkercEjgGuBr4+yWWeDdw0ybpTtS/wHOBdQ+VvBLaZoW1ulha+TwD+AfgIcPPctqjXjgB+HzgY+D7w7XHqHQ5cBvzrLLVL0iwzAGqxuauqvjwwfU6SE4DPA2ckeXhV/ayq7gC+PHoV90ySbarqJ1X1tZlY/0Sqarwv9Lm0B7AFcHJVXTLXjem5Xwe+VVX/MtcNkTS3PAWsRa+qbgFeAzwMeDqMPgWc5IAkX0ny43b6+IIkT26zb2t/PzBwmnnXgfU8P8lpSW4BPtnWN/J0WZIDk3wzyU+TfCHJIwfmjTw1PXhqN8mxwKuAhw605ZThegPLPibJuUlub/36YJIHjNjm85K8L8mtSdYl+eskm/yMSHJkkiuT3JFkbZJXDMw7Fvh/bfLiiU47TkaSV7XT+rcmuT7JJ5PsMVTn/CQfTfJHrT0/TPLpJMuG6j2klf+knZ4+tC13/kCdUa/nqH1nMu1KkjcmuaG16eQkB43tSwP1tk7y1iTXtNf04iTPmMRrc/8kp7bTu7e312HFwPyrgcOAx47tN+Os53zgccAhA/vXoUN1XtH2kZvTnVLebmj+Dm1fur7t5/+Z5AmbaP+vJnl7ku+1fl+b5ONJthyoM237cpLntv32J0nOSzL2uhw6UGeizwRpQTMAqi/OA+4C9h41M8nDgI8Cn6M7RfZ84FPADq3KU9vfN9GdOv5tutPIY95OFxKfC/ztBO14KPBOulO1f0R3mvqcJFtPoS/vBz4E/GCgLW8cp19LgfOBe7Xt/SnwZGD14Bdr81bgR3Snlv8ZeH17Pq4kLwL+HjiL7nX7CPCOJEcPtPWl7fnzW1v/bXLdHGkZ3ankVcCL6I4sfjHJ/YbqPQE4ki4oH053KcCJA+1Oa/MjgBcCrwT+rC03U+16OfAXwHvpXtef0L3mwz4KHEq3H/0+cBFwVjZ9/em/AvsBrwb+F93n+3kDQfTZwNl0lz2M7TejvKTVOXug3uB79jxgH7rX9SjgWQzs80m2Av6d7j9bfw4cCKwH/j1D1+IOeS3dPvJXbdmX013SsUVb77Ttyy0Ynw58tb0uZwFnDK5gEp8J0sJWVT58LIoHcCxw4wTzrwNOaM93BQp4Vpt+DnDTBMvep9U/dKh8bD0fH7HM1cDbB6ZPaXV/Z6DsoXTB9MWj2jW07JqB6bcDV4/Y5nC9NwO3ANsOlO3VtvGHQ9s8bWhdXwdOn+A1+RW668g+MFT+Hrov7q3b9FPa+h+1ifdvUvUG6m9Bd73jbcDBA+Xnt+1vP1D28rbubdr0M9v0XgN1dgb+Gzh/vNdzovdoona1suuA44fqnt3WtWub3qdNP3mo3ueBj0zwWqwcXg64N13wet9E/RlnfWuAU8bZp78NLBkoexf
wg4Hpw4A7geUDZUvacm+bYJufAt4xwfxp25fp/qNyGZCBstcw8G+cTXwm+PCx0B8eAVSfZIJ5lwL3a6fQ9k1y7ymue7JHtW6oqv8cm6iq7wJfofsimwl7AZ+tqh8ObPNCui/yJw3V/ezQ9DfojmyNZxnwYLov00FnANsCj96M9k4oyd5JVie5iS44304Xzn9tqOpFVTU42OQb7e/O7e/j6ULLhWMVqur7dO/FTLRrF+CBdEeaBg1PP43uyO4XkywZewDnAisY317A+qr6j4H+/JguVA2/z/fUeVV118D0N4CdBo7CPY3udbxqoP0A/8HEffg6cGiS1yT5zXaUdtB07suPBz5ZVYOnwYffi3v6mSDNawZA9UI7xbojcP2o+VX1LbrTd7vTHZW5McmH2mmnyRi53hFuGKfsQZNcfqoexOi2Xc/Gp7JuGZq+E5jo1PRYm4fXPzY9rafKkjyE7os9wP8Bnkj3RX4DG7dzVF8YqPdAuqNjw0aVTUe7xk59Dq9/ePr+re5/Dz2OpQuR45nK+3xPjXptQzdSHro+7M3GfXgBE/fhTcDxdKegLwauSfKygfnTuS+Pev/vNj0NnwnSvOYoYPXF79Ht718ar0JV/Rvwb+26rWfSndr6e+CgSax/5AX1I+w0Ttnl7flP29/ha5o290v8unG2+QA282jX0LoZsf6xi/I33MP1D1tJd/3XqnZ0i3Z0aXNemx8Ao77Il/LL94D2fFPvxWTa9YOB9Q9vb9AGutPqG92DbxMmep+n+33YlA10p5CPGDHvjvEWqqqf0l2r9/oky4EXA+9K8q2q+gzTuy+Pev832h/u4WeCNK95BFCLXhuh+BZgLd3F6ROqqlur6kPAx4GxEbrDR5A2105JfmegbQ+hG6AwdiryBrqjJY8YqHMfNr5gf1NH58ZcAOyX5L4D63s83bVSX9iM9g9aB1xLN/Bl0POAH9KdQptO2wA/pzvFOritzfmP7EXAA5P84tR7kp3pRr8OWgfsOjRI5+mb0a5r6ELHqqFlDxiaPpfu6NSPqmrN8GOC/lxAt2/97kB/7kUXWjbnfZ7s/jXKuXS3/vneiD5Map+oqivpBrPcwS//DU7nvnwR8PtDp5mH34vB9oz6TJAWNI8AarFZkmRspO996b7Qj6A7QrOyqn42aqEk/4cuZH2GLtQspws2pwFU1Z1JrgKel+QyuiNDm3NPuxuBf0ryV3SjQN9AF/pOadv5eZJPAK9I8l26U1mvanUHfRN4QLtlxWV0g1+uHrG9d7b+n5PkLXTXpb2ZLpzdo3vBtbYeC7yvXfu2mm5U5hHAX7QjOpvj6Ul+fajsG3SjMbeguxXPScBv0IWE4dN9k3E23WnGM5O8lu71PYbudOLPB+r9K9179P50t9p5LN2pzEGbbFdV/SzJ24C3JVkPfJEucIxdJzm2zdXAOXQjW99Cd2R4W+AxdINqXjuqM1V1TpIv0t3r8mi6G5C/mi6cvm0Kr8uYb9KFrf3auq6qqsne1Pw0uqN356e7DdJ36C6/2IvuusvjRi2U5ON0R/K+Rvd+PIfuO+rzrcp07stvoQuUpyf5AN1/uF7U5v28tWfCzwRpwZvrUSg+fEzXg+46qWqPn9N9Aa8B/gZ44FDdXbn7KOCxW11cSxfurqL7kthqYJl96ULfT9uyuw6vZ2gbV7PxKOA1wB8A/0V3dOOLDI16pTul9Qm6o2jfpbvdxincfXTv1sAH6MJj0UZsDtdrZY+lCym3t9fkQ8ADxnsthts7idf9SLqjq3fSfdm/Ymj+U5jaKOBRj2NbnYPpRpP+hO5G3k8Y8TqfD3x0U22gG4H9mfZ+jr3OnwX+dWjZQ9s2b6cbVPE7w6/XJNsVutv1rKcbIfxBukBTwHYD9bYC/nrgNf1Ba+czN/H6LaULJze3dvwH8PjNfE93pztafit3Hxl7tz4NvD4F3Geg7H7A39Ed+byT7kjqx4AnTrDNP6f793Fre30uoDutPiP7Mt1R2rXt/f8
C3eCVAg6c7GeCDx8L+ZGqyV66JEmLV7vO6zvAP1TVMbO0zfcDT6+qh87G9jS+JH8M/BOwe1VdNdftkWaap4Al9VKSF9MdKb6S7ujZK+mOvp08Q9t7FN0Nmv+zbXd/utPJR83E9jSxdD8RuZruiOmewF8C/2b4U18YACX11R104eshdKf+LgSeVt29GWfCj+nuV3ck3U2av9u2/44Z2p4mtiPdTct3pLvO8Qy6m0FLveApYEmSpJ7xNjCSJEk9YwCUJEnqGQOgJElSzxgAJUmSesYAKEmS1DMGQEmSpJ4xAEqSJPWMAVCSJKlnDICSJEk9YwCUJEnqGQOgJElSzxgAJUmSesYAKEmS1DMGQEmSpJ4xAEqSJPWMAVCSJKlnZjUAJjk5yQ1JLhsx79VJKsn923SSvDvJ2iSXJNlzoO4hSa5sj0MGyh+X5NK2zLuTpJXvkGR1q786yfab2oYkSdJiNdtHAE8BVg4XJtkFeDrwvYHi/YHl7XE4cEKruwNwDPAEYC/gmLFA1+ocPrDc2LaOBs6tquXAuW163G1IkiQtZktmc2NV9fkku46YdRzwGuATA2WrgNOqqoAvJ9kuyYOApwCrq2oDQJLVwMok5wPbVtWXWvlpwIHAp9u6ntLWeypwPnDUeNuoqusm6sfKlSvrM5/5zNQ6L0mSNPsyqnBWA+AoSQ4Avl9VF7cztmN2Bq4ZmF7XyiYqXzeiHOABY6Guqq5LstMmtrFRAExyON1RQh7ykIdMoYeSJEnzy5wOAklyL+B1wOtHzR5RVptRPmETJrtMVZ1YVSuqasXSpUs3sVpJkqT5a65HAT8M2A24OMnVwDLgq0keSHc0bpeBusuAazdRvmxEOcD17fQx7e8NrXy8dUmSJC1acxoAq+rSqtqpqnatql3pAtmeVfUD4Czg4DZSd2/g1nYa9xxg3yTbt8Ef+wLntHm3Jdm7jf49mF9eU3gWMDZa+JCh8lHbkCRJWrRm9RrAJB+mG4xx/yTrgGOq6qRxqp8NPANYC9wOvACgqjYkeSNwUav3hrEBIcARdCONt6Eb/PHpVv5m4Mwkh9GNNH7uRNuQJElazNINgNVUrFixotasWTPXzZAkSdqUkaOA5/oaQEmSJM0yA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeqZWf0puMXupkvXznUTpmTHR+8x102QJElzwCOAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzsxoAk5yc5IYklw2UvS3JN5NckuTjSbYbmPfaJGuTfCvJfgPlK1vZ2iRHD5TvluSCJFcmOSPJlq18qza9ts3fdVPbkCRJWqxm+wjgKcDKobLVwKOq6jeB/wJeC5DkkcBBwG+0Zd6TZIskWwDHA/sDjwT+sNUFeAtwXFUtB24GDmvlhwE3V9UewHGt3rjbmO5OS5IkzSezGgCr6vPAhqGyz1bVXW3yy8Cy9nwVcHpV3VFVVwFrgb3aY21Vfaeq7gROB1YlCfBU4KNt+VOBAwfWdWp7/lFgn1Z/vG1IkiQtWvPtGsAXAp9uz3cGrhmYt66VjVe+I3DLQJgcK7/butr8W1v98da1kSSHJ1mTZM369es3q3OSJEnzwbwJgEleB9wFfHCsaES12ozyzVnXxoVVJ1bViqpasXTp0lFVJEmSFoQlc90AgCSHAM8C9qmqsQC2DthloNoy4Nr2fFT5jcB2SZa0o3yD9cfWtS7JEuB+dKeiJ9qGJEnSojTnRwCTrASOAg6oqtsHZp0FHNR
G8O4GLAcuBC4ClrcRv1vSDeI4qwXH84DntOUPAT4xsK5D2vPnAJ9r9cfbhiRJ0qI1q0cAk3wYeApw/yTrgGPoRv1uBazuxmXw5ap6cVVdnuRM4Bt0p4ZfWlU/a+s5EjgH2AI4uaoub5s4Cjg9yZuArwEntfKTgH9KspbuyN9BABNtQ5IkabHKL8+4arJWrFhRa9as2aj8pkvXzkFrNt+Oj95jrpsgSZJm1qjxDnN/CliSJEmzywAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASpIk9YwBUJIkqWcMgJIkST1jAJQkSeoZA6AkSVLPzGoATHJykhuSXDZQtkOS1UmubH+3b+VJ8u4ka5NckmTPgWUOafWvTHLIQPnjklzalnl3kmzuNiRJkhar2T4CeAqwcqjsaODcqloOnNumAfYHlrfH4cAJ0IU54BjgCcBewDFjga7VOXxguZWbsw1JkqTFbFYDYFV9HtgwVLwKOLU9PxU4cKD8tOp8GdguyYOA/YDVVbWhqm4GVgMr27xtq+pLVVXAaUPrmso2JEmSFq35cA3gA6rqOoD2d6dWvjNwzUC9da1sovJ1I8o3ZxsbSXJ4kjVJ1qxfv35KHZQkSZpP5kMAHE9GlNVmlG/ONjYurDqxqlZU1YqlS5duYrWSJEnz13wIgNePnXZtf29o5euAXQbqLQOu3UT5shHlm7MNSZKkRWs+BMCzgLGRvIcAnxgoP7iN1N0buLWdvj0H2DfJ9m3wx77AOW3ebUn2bqN/Dx5a11S2IUmStGgtmc2NJfkw8BTg/knW0Y3mfTNwZpLDgO8Bz23VzwaeAawFbgdeAFBVG5K8Ebio1XtDVY0NLDmCbqTxNsCn24OpbkOSJGkxSzdgVlOxYsWKWrNmzUblN126dg5as/l2fPQec90ESZI0s0aNd5gXp4AlSZI0iwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1zLwJgElekeTyJJcl+XCSrZPsluSCJFcmOSPJlq3uVm16bZu/68B6XtvKv5Vkv4Hyla1sbZKjB8pHbkOSJGmxmhcBMMnOwJ8BK6rqUcAWwEHAW4Djqmo5cDNwWFvkMODmqtoDOK7VI8kj23K/AawE3pNkiyRbAMcD+wOPBP6w1WWCbUiSJC1K8yIANkuAbZIsAe4FXAc8Ffhom38qcGB7vqpN0+bvkySt/PSquqOqrgLWAnu1x9qq+k5V3QmcDqxqy4y3DUmSpEVpXgTAqvo+8Hbge3TB71bgK8AtVXVXq7YO2Lk93xm4pi17V6u/42D50DLjle84wTbuJsnhSdYkWbN+/frN76wkSdIcmxcBMMn2dEfvdgMeDNyb7nTtsBpbZJx501W+cWHViVW1oqpWLF26dFQVSZKkBWFeBEDgacBVVbW+qv4b+BjwO8B27ZQwwDLg2vZ8HbALQJt/P2DDYPnQMuOV3zj
BNiRJkhal+RIAvwfsneRe7bq8fYBvAOcBz2l1DgE+0Z6f1aZp8z9XVdXKD2qjhHcDlgMXAhcBy9uI3y3pBoqc1ZYZbxuSJEmL0qQDYJKDk+w4zrwdkhy8uY2oqgvoBmJ8Fbi0tetE4CjglUnW0l2vd1Jb5CRgx1b+SuDotp7LgTPpwuNngJdW1c/aNX5HAucAVwBntrpMsA1JkqRFKd1BsElUTH4G/HZVXThi3uOAC6tqi2lu37y0YsWKWrNmzUblN126dg5as/l2fPQec90ESZI0s0aNd5jSKeCRK2h2BH44peZIkiRpTiyZaGaSVXSjc8f8VZLhe6BsDfwPuuvsJEmSNM9NGACBnYBHD0w/DHjgUJ07gc8Cb5rGdkmSJGmGTBgAq+ofgX8ESHIecERVfXM2GiZJkqSZsakjgL9QVb83kw2RJEnS7Jh0AARI8mDgWXQ3TN56aHZV1VHT1TBJkiTNjEkHwCTPBj4MbAHcQHft36Ciu6eeJEmS5rGpHAH8W7rBHodW1YYZao8kSZJm2FQC4C7Anxr+JEmSFrap3Aj6P4GHz1RDJEmSNDumcgTwlcAHk/wIWA3cMlyhqm6froZJkiRpZkwlAF7S/n6AbsDHKL34LWBJkqSFbCoB8IWMH/wkSZK0QEzlRtCnzGA7JEmSNEumMghEkiRJi8BUbgS9nk2cAq6qne5xiyRJkjSjpnIN4PFsHAB3AJ4KbAucNF2NkiRJ0syZyjWAx44qTxLgTOCuaWqTJEmSZtA9vgawqgp4P3DkPW+OJEmSZtp0DQLZHdhymtYlSZKkGTSVQSAvGVG8JfAI4PnAR6arUZIkSZo5UxkE8g8jyu4A1gHvAf56WlokSZKkGTWVQSDeM1CSJGkRMNRJkiT1zJQCYJLdk5yQ5NIk329/35Nk95lqoCRJkqbXVAaBPA44D/gp8CngeuABwP8Enp/k96rqqzPSSkmSJE2bqQwCeTvwNWD/qrp9rDDJvYCz2/ynTm/zJEmSNN2mcgp4L+Ctg+EPoE2/HXjCdDZMkiRJM2MqAfAnwI7jzNuB7tSwJEmS5rmpBMB/A96c5EmDhW36/wKfnM6GSZIkaWZM5RrAVwKfAP4jyXq6QSA70Q0E+SLwqulvniRJkqbbVG4EfRPwpCQrgccDDwKuAy6oqs/OUPskSZI0zSY8BZxkxyT/kmS/sbKq+kxVvbGqXlJVb+yq5V+S7DTjrZUkSdI9tqkjgC8HdgcmOsL3WbprAF8FHDVN7dI88/0vLLxbPO78pD3nugmSJM1LmxoE8jzgvVVV41Vo894HrJrOhkmSJGlmbCoAPhT4xiTWcwWw6z1ujSRJkmbcpgLgT4BtJ7Ge+7S6kiRJmuc2FQC/ChwwifWsanUlSZI0z20qAB4PHJbkkPEqJDkYeAHwD9PZMEmSJM2MCUcBV9XHkvwd8IEkRwKfAb4HFPAQYD9gBXBcVX18phsrSZKke26TPwVXVa+iO8X7Q+DVdCN+TwT+HLgNWFVVr76nDUmyXZKPJvlmkiuS/HaSHZKsTnJl+7t9q5sk706yNsklSfYcWM8hrf6Vg0cukzwuyaVtmXcnSSsfuQ1JkqTFalK/BVxVn6yqfYD70v0CyIOA+1bV06rqU9PUlr8DPlNVvw78Ft3I4qOBc6tqOXBumwbYH1jeHocDJ0AX5oBjgCcAewHHDAS6E1rdseVWtvLxtiFJkrQoTSoAjqmqu6rq+va4a7oakWRb4HeBk9p27qyqW+iOPJ7aqp0KHNierwJOq86Xge2SPIjulPTqqtpQVTcDq4GVbd62VfWldt/C04bWNWobkiRJi9KUAuAM2h1YT3et4deSvD/JvYEHVNV1AO3v2M/N7QxcM7D8ulY2Ufm6EeVMsI27SXJ4kjVJ1qxfv37zeypJkjTH5ksAXALsCZxQVY8FfszEp2Izoqw2o3zSqurEqlpRVSuWLl06lUUlSZLmlfkSANcB66rqgjb9UbpAeH07fUv7e8NA/V0Gll8
GXLuJ8mUjyplgG5IkSYvSvAiAVfUD4JokD29F+9D9BN1ZwNhI3kOAT7TnZwEHt9HAewO3ttO35wD7Jtm+Df7YFzinzbstyd5t9O/BQ+satQ1JkqRFacL7AM6yPwU+mGRL4Dt0N5f+FeDMJIfR3X/wua3u2cAzgLXA7a0uVbUhyRuBi1q9N1TVhvb8COAUYBvg0+0B8OZxtiFJkrQozZsAWFVfp7up9LB9RtQt4KXjrOdk4OQR5WuAR40ov2nUNiRJkhareXEKWJIkSbPHAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6pl5FQCTbJHka0k+1aZ3S3JBkiuTnJFky1a+VZte2+bvOrCO17bybyXZb6B8ZStbm+TogfKR25AkSVqs5lUABF4GXDEw/RbguKpaDtwMHNbKDwNurqo9gONaPZI8EjgI+A1gJfCeFiq3AI4H9gceCfxhqzvRNiRJkhaleRMAkywDngm8v00HeCrw0VblVODA9nxVm6bN36fVXwWcXlV3VNVVwFpgr/ZYW1Xfqao7gdOBVZvYhiRJ0qI0bwIg8C7gNcDP2/SOwC1VdVebXgfs3J7vDFwD0Obf2ur/onxomfHKJ9rG3SQ5PMmaJGvWr1+/uX2UJEmac/MiACZ5FnBDVX1lsHhE1drEvOkq37iw6sSqWlFVK5YuXTqqiiRJ0oKwZK4b0DwROCDJM4CtgW3pjghul2RJO0K3DLi21V8H7AKsS7IEuB+wYaB8zOAyo8pvnGAbkiRJi9K8OAJYVa+tqmVVtSvdII7PVdXzgfOA57RqhwCfaM/PatO0+Z+rqmrlB7VRwrsBy4ELgYuA5W3E75ZtG2e1ZcbbhhkldqcAABD0SURBVCRJ0qI0LwLgBI4CXplkLd31eie18pOAHVv5K4GjAarqcuBM4BvAZ4CXVtXP2tG9I4Fz6EYZn9nqTrQNSZKkRSndQTBNxYoVK2rNmjUbld906do5aM3m2/HRe0y67ve/8NUZbMnM2PlJe851EyRJmmujxjvM+yOAkiRJmmYGQEmSpJ4xAEqSJPWMAVCSJKlnDICSJEk9YwCUJEnqGQOgJElSzxgAJUmSesYAKEmS1DMGQEmSpJ4xAEqSJPWMAVCSJKlnDICSJEk9YwCUJEnqGQOgJElSzxgAJUmSesYAKEmS1DMGQEmSpJ4xAEqSJPWMAVCSJKlnDICSJEk9YwCUJEnqGQOgJElSzxgAJUmSesYAKEmS1DMGQEmSpJ4xAEqSJPWMAVCSJKlnDICSJEk9YwCUJEnqGQOgJElSzxgAJUmSesYAKEmS1DMGQEmSpJ4xAEqSJPWMAVCSJKlnDICSJEk9YwCUJEnqGQOgJElSzxgAJUmSemZeBMAkuyQ5L8kVSS5P8rJWvkOS1UmubH+3b+VJ8u4ka5NckmTPgXUd0upfmeSQgfLHJbm0LfPuJJloG5IkSYvVvAiAwF3Aq6rqEcDewEuTPBI4Gji3qpYD57ZpgP2B5e1xOHACdGEOOAZ4ArAXcMxAoDuh1R1bbmUrH28bkiRJi9K8CIBVdV1VfbU9vw24AtgZWAWc2qqdChzYnq8CTqvOl4HtkjwI2A9YXVUbqupmYDWwss3btqq+VFUFnDa0rlHbkCRJWpTmRQAclGRX4LHABcADquo66EIisFOrtjNwzcBi61rZROXrRpQzwTaG23V4kjVJ1qxfv35zuydJkjTn5lUATHI
f4F+Al1fVDyeqOqKsNqN80qrqxKpaUVUrli5dOpVFJUmS5pV5EwCT/Cpd+PtgVX2sFV/fTt/S/t7QytcBuwwsvgy4dhPly0aUT7QNSZKkRWleBMA2Ivck4IqqeufArLOAsZG8hwCfGCg/uI0G3hu4tZ2+PQfYN8n2bfDHvsA5bd5tSfZu2zp4aF2jtiFJkrQoLZnrBjRPBP4EuDTJ11vZXwBvBs5MchjwPeC5bd7ZwDOAtcDtwAsAqmpDkjcCF7V6b6iqDe35EcApwDbAp9uDCbYhSZK0KM2LAFhVX2D0dXoA+4yoX8BLx1nXycDJI8rXAI8aUX7TqG1IkiQtVvPiFLAkSZJmjwFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1jAFQkiSpZwyAkiRJPWMAlCRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSeMQBKkiT1zJK5boA0H3z77M/PdROm7GHP+N25boIkaYHyCKAkSVLPGAAlSZJ6xlPAUg9cevqn5roJU/Log541102QpEXNI4CSJEk9YwCUJEnqGQOgJElSzxgAJUmSesYAKEmS1DMGQEmSpJ4xAEqSJPWMAVCSJKlnDICSJEk9YwCUJEnqGQOgJElSzxgAJUmSembJXDdAku6JC/7xQ3PdhCl7wov+aNJ1P/eO985gS6bfU1/14rlugqRJ8AigJElSzxgAJUmSesYAKEmS1DMGQEmSpJ4xAEqSJPWMo4AlSXPmrGPfMtdNmJIDjj1q0nU/9Oq/nMGWzIw/evub5roJmiUeAWySrEzyrSRrkxw91+2RJEmaKR4BBJJsARwPPB1YB1yU5Kyq+sbctkySpPnpvS/+s7luwpS8+L3vnusmzCsGwM5ewNqq+g5AktOBVYABUJKkHnrr8w+d6yZMyWs+eMqU6qeqZqQhC0mS5wArq+p/t+k/AZ5QVUcO1DkcOLxNPhz41iw28f7AjbO4vdm2mPu3mPsG9m8hW8x9g8Xdv8XcN7B/0+3Gqlo5XOgRwE5GlN0tGVfVicCJs9Ocu0uypqpWzMW2Z8Ni7t9i7hvYv4VsMfcNFnf/FnPfwP7NFgeBdNYBuwxMLwOunaO2SJIkzSgDYOciYHmS3ZJsCRwEnDXHbZIkSZoRngIGququJEcC5wBbACdX1eVz3KxBc3LqeRYt5v4t5r6B/VvIFnPfYHH3bzH3DezfrHAQiCRJUs94CliSJKlnDICSJEk9YwCcY0muTnJpkq8nWdPK3pbkm0kuSfLxJNuNs+yC+fm6JL+e5EtJ7kjy6gnq7ZbkgiRXJjmjDcqZ15JsneTCJBcnuTzJX4+os1Xrz9rWv11nv6WTl+TkJDckuWyg7Iy2n3697bdfH2fZeb1fJtklyXlJrmjv18uG5r86SSW5/zjLH9L2zyuTHDI7rZ688fbHJEe292TcvrV6871/o/bNHZKsbm1enWT7cZad130bk2SLJF9L8qk2fUqSqwb+/T1mnOXmff/G+c77rfb9cGmSTybZdpxl5/tny3j/9vZJ8tXW5y8k2WOc5V/b+vatJPvNeIOrysccPoCrgfsPle0LLGnP3wK8ZcRyWwDfBnYHtgQuBh451/2ZoJ87AY8H/gZ49QT1zgQOas/fCxwx122fRN8C3Kc9/1XgAmDvoTovAd7bnh8EnDHX7d5En34X2BO4bJz57wBeP6J83u+XwIOAPdvz+wL/NdZGuttBnQN8d/jfZZu/A/Cd9nf79nz7ue7TUBtH7o/AY4FdR33mLLD+bbRvAm8Fjm7Pjx7nM3Pe922gra8EPgR8qk2fAjxnE8ssiP6N8513EfDk9vyFwBtHLLcQPlvG+7f3X8AjWvlLgFNGLPvI1qetgN1aX7eYyfZ6BHAeqqrPVtVdbfLLdPclHPaLn6+rqjuBsZ+vm5eq6oaqugj47/HqJAnwVOCjrehU4MBZaN49Up0ftclfbY/h0VWr6PoDXf/2af2
dl6rq88CGUfNau58HfHjE7Hm/X1bVdVX11fb8NuAKYOc2+zjgNWz8/o3ZD1hdVRuq6mZgNbDRHfbn0nj7Y1V9raqu3sTiC6F/o/bNwX9f431uzPu+ASRZBjwTeP8UF10Q/RvHw4HPt+ergf85os5C+GwZ77uggLGjmvdj9H2GVwGnV9UdVXUVsJauzzPGADj3Cvhskq+k+7m5YS8EPj2ifGfgmoHpdfzyS2yh2hG4ZSD8Lpg+tVM2XwduoPsQvmCoyi/er9a/W+n6uxD9D+D6qrpyxLwFtV+2U/GPBS5IcgDw/aq6eIJFFkT/JrE/jmdB9G+EB1TVddAFfLozDsMWSt/eRfefkJ8Plf9NusuCjkuy1YjlFkr/Rn3nXQYc0J4/l7v/MMOYBdG/cf7t/W/g7CTrgD8B3jxi0VnvnwFw7j2xqvYE9gdemuR3x2YkeR1wF/DBEctt8ufrFqAF26eq+llVPYbuaO1eSR41VGXB9m2EP2T00T9YQP1Mch/gX4CX0/07ex3w+k0tNqJs3vVvEvvjeBZE/zbTvO9bkmcBN1TVV4ZmvRb4dbrLaHYAjhq1+IiyedW/ZtR33gvb86/QXZZx54jlFkT/xvm39wrgGVW1DPgA8M4Ri856/wyAc6yqrm1/bwA+Tjvk2y7gfRbw/GoXCAyZ9z9fl+SlAxctP3gSi9wIbJdk7Abl865Pm1JVtwDns/Gpl1+8X61/92OcU6zzWWv7HwBnjFNl3u+XAEl+lS78fbCqPgY8jO66m4uTXE3X7q8meeDQoguif2Mm2B/Hs6D6N+D6JA8CaH9vGFFnIfTticABbR88HXhqkn9uly1UVd1BFyBGnRpcCP0b+Z1XVd+sqn2r6nF0/7n89ohFF0T/xgz829sf+K2Bo/BnAL8zYpFZ758BcA4luXeS+449pxv8cVmSlXT/wzugqm4fZ/F5//N1VXV8VT2mPTa5I7egex7wnFZ0CPCJmWzjdEiyNG2kdpJtgKcB3xyqdhZdf6Dr3+fGCfbz3dOAb1bVunHmz/v9sl3DeBJwRVW9E6CqLq2qnapq16rale7DeM+q+sHQ4ucA+ybZvo003beVzRuT3B/HM+/7N47Bf1/jfW7M+75V1WuralnbBw+i+5z444FwG7rrGy8bsfi8798E33k7tbJfAf6SbgDgsIXw2TLq394VwP2S/Fqr9vRWNuws4KB0d4zYDVgOXDijDZ7JESY+NjliaHe6UT8XA5cDr2vla+muBfh6e4yNHn0wcPbA8s+gG1307bFl5+sDeCDdl+oPgVva823bvLOBBw+8Jhe21+AjwFZz3fZJ9O03ga8Bl9B9ML++lb+BLsQDbN36s7b1b/e5bvcm+vRh4Dq6QTvrgMNa+SnAi4fqLqj9EngS3amVSwb+jT1jqM7VtJGKwArg/QPzXtjex7XAC+a6P1PYH/+svZd30R1ZeP8C7d9G+ybd9bTnAle2vzssxL4N9fMp/HIU8OeAS9v7+c/8cqTpguof43/nvax9ZvwX3fVxY79SttA+W8b7t/fs9v5dTHdUcPdWfgDwhoHlX9f69i1g/5lurz8FJ0mS1DOeApYkSeoZA6AkSVLPGAAlSZJ6xgAoSZLUMwZASZKknjEASuqtJMcmuXGu2yFJs80AKEmS1DMGQEmSpJ4xAErSCO1nq/4hybeS3J7kqiTHJ9l2qF4leVmSv02yPskNrd5WQ/WekuSSJD9NclGSvZLcmOTYgTpXJ3n70HKHtm3cZ4rt2j7J6Ul+nOTaJEcleXv7ndnBeg9p9Ta09Z2T5OHT8ypKmq+WzHUDJGmeuhewBd3PM62n+6H219H9pN9+Q3VfRfdzXX9M93NQ/xf4LvBWgCQ70/3k4X8Cf0H304gfBLaZwXadQvezdy8DfgC8Avg14GdjFZLsAHwBuAl4MXA7cDTw70l+rap+shntk7QAGAAlaYSqWg8cMTadZAlwFfCFJA+pqu8NVL+6qg5tz89J8kTgD2gBEHg5Xbj6/bF
QleSHwBkz0a4kj6L7ndHnVdVHWr1z6X5j/EcDq3sFcG/gMVW1odX7It1vIb8QOH6q7ZO0MHgKWJLGkeRPknwtyY+A/6Y7WgbdkbRBnx2a/gawbGD68cDqoSNqZ81gu1a0v58cW6Zt+9+HVvU0YDXwwyRLWpi8DfjKwDokLUIGQEkaIcmzgdOALwHPBfYGnt1mbz1U/Zah6TuH6jyQ7nTtL1TVT7n70bjpbNcDgdvaNgatH5q+P/C/6ELk4OP36E4tS1qkPAUsSaM9F7igql4yVpDkyZu5rh8ASwcLkmwN3Geo3k+BLYfKdtiMdv0AuG+SrYdC4NKhehvojkS+cUSbbxtRJmmRMABK0mjbAHcMlT1/M9d1EfCCJNsMnAY+YES9dcAjhsqevhntWjOwjTMBkmzT1jUY7M4Fngdc7oAPqV8MgJL6bsskzxlR/nXg2CSvAy4AngHss5nbeBfwUuCTSY6jO0V7NN3AkJ8P1Ps48PdJ/oIuNP4B8BtD61oNHD9Ru6rqsiSfBE5Icl+6I4KvHLG9d9KNXP5ckr8Hvg88AHgy8IWq+vBm9lfSPGcAlNR396W7hcqwpwHvoLuNytZ0weuPgC9PdQNV9f0kzwT+DvgYcAXdKNvVwA8Hqp4IPAz4M2Arumv93gS8b6DO+4DdJ9GuQ4ETgHfTXWt4PPAdugEpY+26McnewN8AxwHbAdfRDSq5ZKr9lLRwpKrmug2S1DtJngT8P+CpVXXeLGxvCXAZ3fWDh8z09iTNbx4BlKRZkOQtwNfoTsc+HPgruqNs/zFD23su8GDgUmBb4EXAcuDgmdiepIXFAChJs2Mr4G1019jdRnfvwFdW1c8nXGrz/Rh4AbAH3S+HXEp3I+oLZ2h7khYQTwFLkiT1jDeCliRJ6hkDoCRJUs8YACVJknrGAChJktQzBkBJkqSe+f+tGM0FHj8NTQAAAABJRU5ErkJggg==\n",
+      "text/plain": [
+       "<Figure size 648x432 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "language = songs.groupby(['language']).size()\n",
+    "language = pd.DataFrame(language)\n",
+    "language.reset_index(level=0, inplace=True)\n",
+    "language.columns = ['Language', 'Count']\n",
+    "language = language.sort_values(by='Count', ascending=False)\n",
+    "language['Language'] = language['Language'].astype('str')\n",
+    "ax = sns.catplot(x='Language', y='Count', kind='bar', order=language['Language'],\n",
+    "                 palette=\"ch:10,-0.1,dark=.4\", data=language, height=6, aspect=1.5)\n",
+    "ax.fig.subplots_adjust(top=.9)\n",
+    "ax.fig.suptitle('Distribution of Language of the songs', fontsize=15)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "count    2.296320e+06\n",
+      "mean     4.116558e+00\n",
+      "std      2.682000e+00\n",
+      "min      3.083333e-03\n",
+      "25%      3.060000e+00\n",
+      "50%      3.777117e+00\n",
+      "75%      4.621150e+00\n",
+      "max      2.028975e+02\n",
+      "Name: song_length, dtype: float64\n"
+     ]
+    },
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 720x504 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "print((songs.song_length/60000).describe())\n",
+    "plt.subplots(figsize=(10, 7))\n",
+    "x = songs.song_length/60000\n",
+    "ax = sns.distplot(x, color='grey')\n",
+    "ax.set_title('Distribution of Song Length', fontsize=15)\n",
+    "ax.set(xlabel='Song Length', ylabel='Count')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def genres_separate(data):\n",
+    "    results = {}\n",
+    "    for i in range(len(songs.genre_ids)):\n",
+    "        data = str(songs.genre_ids[i]).split('|')\n",
+    "        for d in data:\n",
+    "            if d not in results:\n",
+    "                results[d] = 1\n",
+    "                continue\n",
+    "            results[d] += 1\n",
+    "\n",
+    "    return results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gener_count = genres_separate(songs.genre_ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 1008x504 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "gener_df = pd.DataFrame.from_dict(\n",
+    "    gener_count, orient='index', columns=['counts'])\n",
+    "gener_df.reset_index(level=0, inplace=True)\n",
+    "gener_df = gener_df.sort_values(by='counts', ascending=False)\n",
+    "gener_df.columns = ['Genre', 'Count']\n",
+    "gener_df = gener_df[0:50]\n",
+    "ax = sns.catplot(x='Genre', y='Count', kind='bar', order=gener_df['Genre'],\n",
+    "                 palette=\"ch:7,-0.8,dark=.4\", data=gener_df, height=7, aspect=2)\n",
+    "ax.fig.subplots_adjust(top=.9)\n",
+    "ax.fig.suptitle('Distribution of Genre of the songs', fontsize=15)\n",
+    "for ax in ax.axes.flat:\n",
+    "    plt.setp(ax.get_xticklabels(), rotation=90)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Song Extra Info Data Exploration"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4.1 Count the distinct values in the song extra info data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "songs_extra = pd.read_csv('song_extra_info.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 126,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total number of records:2295971\n",
+      "The distinct song_id in records:2295971\n",
+      "The distinct name in records:1168979\n",
+      "The distinct isrc in records:1806825\n"
+     ]
+    }
+   ],
+   "source": [
+    "summarize(songs_extra)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 127,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Nan_count</th>\n",
+       "      <th>Percentage</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>song_id</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>name</th>\n",
+       "      <td>2</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>isrc</th>\n",
+       "      <td>136548</td>\n",
+       "      <td>5.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         Nan_count  Percentage\n",
+       "song_id          0         0.0\n",
+       "name             2         0.0\n",
+       "isrc        136548         5.0"
+      ]
+     },
+     "execution_count": 127,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "count_nan(songs_extra)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>song_id</th>\n",
+       "      <th>name</th>\n",
+       "      <th>isrc</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=</td>\n",
+       "      <td>我們</td>\n",
+       "      <td>TWUM71200043</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=</td>\n",
+       "      <td>Let Me Love You</td>\n",
+       "      <td>QMZSY1600015</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=</td>\n",
+       "      <td>原諒我</td>\n",
+       "      <td>TWA530887303</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=</td>\n",
+       "      <td>Classic</td>\n",
+       "      <td>USSM11301446</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=</td>\n",
+       "      <td>愛投羅網</td>\n",
+       "      <td>TWA471306001</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                        song_id             name          isrc\n",
+       "0  LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=               我們  TWUM71200043\n",
+       "1  ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=  Let Me Love You  QMZSY1600015\n",
+       "2  u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=              原諒我  TWA530887303\n",
+       "3  92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=          Classic  USSM11301446\n",
+       "4  0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=             愛投羅網  TWA471306001"
+      ]
+     },
+     "execution_count": 50,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "songs_extra.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>song_id</th>\n",
+       "      <th>name</th>\n",
+       "      <th>isrc</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>2295966</th>\n",
+       "      <td>hLnetpF6UbPg28sSfXnPE2vsdaGsLvddlXEdJR4VTIA=</td>\n",
+       "      <td>Deep Breathing</td>\n",
+       "      <td>PLL431720793</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2295967</th>\n",
+       "      <td>N+6vJ8actKQm0S3Fpf4elipTjoAo9ev28aA5FJN5e40=</td>\n",
+       "      <td>In Hiding</td>\n",
+       "      <td>US5UL1519827</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2295968</th>\n",
+       "      <td>pv35uG0ts05mWtirM/AMOWEzbHxIVart5ZzRXqKUY1c=</td>\n",
+       "      <td>Il Est Ne Le Divin Enfant</td>\n",
+       "      <td>PLL431502294</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2295969</th>\n",
+       "      <td>QSySnm8jt2Go7byY34/PxsZP6dPCins2j2cyYquNhBo=</td>\n",
+       "      <td>The Exodus Song</td>\n",
+       "      <td>DEPZ69316095</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2295970</th>\n",
+       "      <td>DYKJKSgDOKxb19XzOVO81176qTH0OIHCsfzFRm/BG+g=</td>\n",
+       "      <td>Like This</td>\n",
+       "      <td>US5UL1512426</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                              song_id  \\\n",
+       "2295966  hLnetpF6UbPg28sSfXnPE2vsdaGsLvddlXEdJR4VTIA=   \n",
+       "2295967  N+6vJ8actKQm0S3Fpf4elipTjoAo9ev28aA5FJN5e40=   \n",
+       "2295968  pv35uG0ts05mWtirM/AMOWEzbHxIVart5ZzRXqKUY1c=   \n",
+       "2295969  QSySnm8jt2Go7byY34/PxsZP6dPCins2j2cyYquNhBo=   \n",
+       "2295970  DYKJKSgDOKxb19XzOVO81176qTH0OIHCsfzFRm/BG+g=   \n",
+       "\n",
+       "                              name          isrc  \n",
+       "2295966             Deep Breathing  PLL431720793  \n",
+       "2295967                  In Hiding  US5UL1519827  \n",
+       "2295968  Il Est Ne Le Divin Enfant  PLL431502294  \n",
+       "2295969            The Exodus Song  DEPZ69316095  \n",
+       "2295970                  Like This  US5UL1512426  "
+      ]
+     },
+     "execution_count": 51,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "songs_extra.tail()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "song_id        2296320\n",
+       "song_length    2296320\n",
+       "genre_ids      2202204\n",
+       "artist_name    2296320\n",
+       "composer       1224966\n",
+       "lyricist        351052\n",
+       "language       2296319\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "songs.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "song_id    2295971\n",
+       "name       2295969\n",
+       "isrc       2159423\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 47,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "songs_extra.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total number of records:2295971\n",
+      "The distinct msno in records:0\n",
+      "The distinct song_id in records:2295971\n",
+      "The distinct source_system_tab in records:0\n",
+      "The distinct source_screen_name in records:0\n",
+      "The distinct source_type in records:0\n",
+      "The distinct target in records:0\n",
+      "The distinct name in records:1168980\n",
+      "The distinct isrc in records:1806826\n"
+     ]
+    }
+   ],
+   "source": [
+    "summarize(songs_extra)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def count_nan(data):\n",
+    "    total_row, total_column = data.shape\n",
+    "    total_nan = [(0, 0) for _ in range(total_column)]\n",
+    "                 \n",
+    "    df = pd.DataFrame(total_nan, columns = ['Nan_count', 'Percentage'], index = data.columns)\n",
+    "   # result.astype({'Percentage':float})\n",
+    "\n",
+    "    for column in data.columns:\n",
+    "        number_nan = data[column].isnull().sum()\n",
+    "        df.loc[column][0] = number_nan\n",
+    "        df.loc[column][1] = number_nan/total_row * 100\n",
+    "    convert_dict = {'Nan_count': int, \n",
+    "                    'Percentage': float\n",
+    "                   } \n",
+    "  \n",
+    "    df = df.astype(convert_dict) \n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tfn/notebooks/DataPreprocessing_FeatureEngineering_Pipeline_df1n19 v1.1.ipynb b/tfn/notebooks/DataPreprocessing_FeatureEngineering_Pipeline_df1n19 v1.1.ipynb
index 300137e3e2d71b93d22426f7063f3e4ac737d4fa..c62f23391cd5d20b286bcdefd0c6157a13f00407 100644
--- a/tfn/notebooks/DataPreprocessing_FeatureEngineering_Pipeline_df1n19 v1.1.ipynb	
+++ b/tfn/notebooks/DataPreprocessing_FeatureEngineering_Pipeline_df1n19 v1.1.ipynb	
@@ -1278,7 +1278,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "songs = apply_pipeline(songs, feature_pipeline_song)\n"
+    "songs = apply_pipeline(songs, feature_pipeline_song)"
    ]
   },
   {
@@ -1583,23 +1583,6 @@
    "execution_count": 16,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# Standardize numerical data: registration_duration & song_length\n",
-    "\n",
-    "transfer_list = ['song_length','registration_duration']\n",
-    "def standardize(data, transfer_list):\n",
-    "    for i in transfer_list:\n",
-    "        data[i] = preprocessing.scale(data[i], axis=0, with_mean=True, with_std=True, copy=True)\n",
-    "        \n",
-    "standardize(train, transfer_list)\n",
-    "standardize(test, transfer_list)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [],
    "source": [
     "# Sparse label data from train dataset\n",
     "\n",
@@ -1613,7 +1596,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -1681,7 +1664,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.420898</td>\n",
+       "      <td>2103</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
@@ -1704,7 +1687,7 @@
        "      <td>13</td>\n",
        "      <td>24.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.596334</td>\n",
+       "      <td>2301</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
@@ -1713,7 +1696,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.303863</td>\n",
+       "      <td>225396.0</td>\n",
        "      <td>2006.0</td>\n",
        "      <td>31</td>\n",
        "      <td>186.0</td>\n",
@@ -1727,7 +1710,7 @@
        "      <td>13</td>\n",
        "      <td>24.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.596334</td>\n",
+       "      <td>2301</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
@@ -1750,7 +1733,7 @@
        "      <td>13</td>\n",
        "      <td>24.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.596334</td>\n",
+       "      <td>2301</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
@@ -1759,7 +1742,7 @@
        "      <td>1</td>\n",
        "      <td>7</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.872754</td>\n",
+       "      <td>187802.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>2</td>\n",
        "      <td>18.0</td>\n",
@@ -1773,7 +1756,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.420898</td>\n",
+       "      <td>2103</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
@@ -1782,7 +1765,7 @@
        "      <td>1</td>\n",
        "      <td>7</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.035210</td>\n",
+       "      <td>247803.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>31</td>\n",
        "      <td>13.0</td>\n",
@@ -1796,7 +1779,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.420898</td>\n",
+       "      <td>2103</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
@@ -1805,7 +1788,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.234466</td>\n",
+       "      <td>229982.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>34.0</td>\n",
@@ -1819,7 +1802,7 @@
        "      <td>13</td>\n",
        "      <td>24.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.596334</td>\n",
+       "      <td>2301</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7</th>\n",
@@ -1828,7 +1811,7 @@
        "      <td>1</td>\n",
        "      <td>7</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.973945</td>\n",
+       "      <td>181115.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>2</td>\n",
        "      <td>39.0</td>\n",
@@ -1842,7 +1825,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.420898</td>\n",
+       "      <td>2103</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8</th>\n",
@@ -1851,7 +1834,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.506754</td>\n",
+       "      <td>278964.0</td>\n",
        "      <td>2007.0</td>\n",
        "      <td>58</td>\n",
        "      <td>153.0</td>\n",
@@ -1865,7 +1848,7 @@
        "      <td>15</td>\n",
        "      <td>26.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.603422</td>\n",
+       "      <td>2309</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>9</th>\n",
@@ -1874,7 +1857,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.179968</td>\n",
+       "      <td>257369.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>153.0</td>\n",
@@ -1888,7 +1871,7 @@
        "      <td>15</td>\n",
        "      <td>26.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.603422</td>\n",
+       "      <td>2309</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>10</th>\n",
@@ -1897,7 +1880,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.326002</td>\n",
+       "      <td>223933.0</td>\n",
        "      <td>2014.0</td>\n",
        "      <td>109</td>\n",
        "      <td>49.0</td>\n",
@@ -1911,7 +1894,7 @@
        "      <td>15</td>\n",
        "      <td>26.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.603422</td>\n",
+       "      <td>2309</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>11</th>\n",
@@ -1920,7 +1903,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.315954</td>\n",
+       "      <td>224597.0</td>\n",
        "      <td>2015.0</td>\n",
        "      <td>109</td>\n",
        "      <td>79.0</td>\n",
@@ -1934,7 +1917,7 @@
        "      <td>15</td>\n",
        "      <td>26.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.603422</td>\n",
+       "      <td>2309</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>12</th>\n",
@@ -1943,7 +1926,7 @@
        "      <td>3</td>\n",
        "      <td>20</td>\n",
        "      <td>11</td>\n",
-       "      <td>0.523702</td>\n",
+       "      <td>280084.0</td>\n",
        "      <td>2014.0</td>\n",
        "      <td>109</td>\n",
        "      <td>236.0</td>\n",
@@ -1957,7 +1940,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.746018</td>\n",
+       "      <td>786</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>13</th>\n",
@@ -1980,7 +1963,7 @@
        "      <td>5</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.688482</td>\n",
+       "      <td>2405</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>14</th>\n",
@@ -1989,7 +1972,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.531335</td>\n",
+       "      <td>210364.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>236.0</td>\n",
@@ -2003,7 +1986,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.746018</td>\n",
+       "      <td>786</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>15</th>\n",
@@ -2012,7 +1995,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.073426</td>\n",
+       "      <td>240624.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>58815.0</td>\n",
@@ -2026,7 +2009,7 @@
        "      <td>5</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.688482</td>\n",
+       "      <td>2405</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>16</th>\n",
@@ -2035,7 +2018,7 @@
        "      <td>3</td>\n",
        "      <td>9</td>\n",
        "      <td>4</td>\n",
-       "      <td>0.050146</td>\n",
+       "      <td>248790.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>13.0</td>\n",
@@ -2049,7 +2032,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.746018</td>\n",
+       "      <td>786</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>17</th>\n",
@@ -2058,7 +2041,7 @@
        "      <td>3</td>\n",
        "      <td>9</td>\n",
        "      <td>4</td>\n",
-       "      <td>0.217920</td>\n",
+       "      <td>259877.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>108</td>\n",
        "      <td>308.0</td>\n",
@@ -2072,7 +2055,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.746018</td>\n",
+       "      <td>786</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>18</th>\n",
@@ -2081,7 +2064,7 @@
        "      <td>3</td>\n",
        "      <td>20</td>\n",
        "      <td>11</td>\n",
-       "      <td>0.306687</td>\n",
+       "      <td>265743.0</td>\n",
        "      <td>2013.0</td>\n",
        "      <td>109</td>\n",
        "      <td>236.0</td>\n",
@@ -2095,7 +2078,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.746018</td>\n",
+       "      <td>786</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>19</th>\n",
@@ -2104,7 +2087,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.052982</td>\n",
+       "      <td>241975.0</td>\n",
        "      <td>2006.0</td>\n",
        "      <td>109</td>\n",
        "      <td>252.0</td>\n",
@@ -2118,7 +2101,7 @@
        "      <td>9</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.147940</td>\n",
+       "      <td>1461</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>20</th>\n",
@@ -2127,7 +2110,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.841823</td>\n",
+       "      <td>189846.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>108</td>\n",
        "      <td>10.0</td>\n",
@@ -2141,7 +2124,7 @@
        "      <td>9</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.147940</td>\n",
+       "      <td>1461</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>21</th>\n",
@@ -2150,7 +2133,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.036845</td>\n",
+       "      <td>247911.0</td>\n",
        "      <td>2003.0</td>\n",
        "      <td>109</td>\n",
        "      <td>298.0</td>\n",
@@ -2164,7 +2147,7 @@
        "      <td>9</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.147940</td>\n",
+       "      <td>1461</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>22</th>\n",
@@ -2187,7 +2170,7 @@
        "      <td>9</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.147940</td>\n",
+       "      <td>1461</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>23</th>\n",
@@ -2210,7 +2193,7 @@
        "      <td>15</td>\n",
        "      <td>18.0</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.866519</td>\n",
+       "      <td>650</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>24</th>\n",
@@ -2233,7 +2216,7 @@
        "      <td>15</td>\n",
        "      <td>18.0</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.866519</td>\n",
+       "      <td>650</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>25</th>\n",
@@ -2242,7 +2225,7 @@
        "      <td>3</td>\n",
        "      <td>11</td>\n",
        "      <td>7</td>\n",
-       "      <td>-1.728693</td>\n",
+       "      <td>131239.0</td>\n",
        "      <td>2006.0</td>\n",
        "      <td>58</td>\n",
        "      <td>546.0</td>\n",
@@ -2256,7 +2239,7 @@
        "      <td>15</td>\n",
        "      <td>18.0</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.866519</td>\n",
+       "      <td>650</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>26</th>\n",
@@ -2265,7 +2248,7 @@
        "      <td>3</td>\n",
        "      <td>11</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.934035</td>\n",
+       "      <td>307200.0</td>\n",
        "      <td>1998.0</td>\n",
        "      <td>109</td>\n",
        "      <td>1.0</td>\n",
@@ -2279,7 +2262,7 @@
        "      <td>15</td>\n",
        "      <td>18.0</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.866519</td>\n",
+       "      <td>650</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>27</th>\n",
@@ -2288,7 +2271,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.477796</td>\n",
+       "      <td>213902.0</td>\n",
        "      <td>2009.0</td>\n",
        "      <td>109</td>\n",
        "      <td>231.0</td>\n",
@@ -2302,7 +2285,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.849741</td>\n",
+       "      <td>2587</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>28</th>\n",
@@ -2311,7 +2294,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.390097</td>\n",
+       "      <td>271255.0</td>\n",
        "      <td>2008.0</td>\n",
        "      <td>109</td>\n",
        "      <td>292.0</td>\n",
@@ -2325,7 +2308,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.849741</td>\n",
+       "      <td>2587</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>29</th>\n",
@@ -2334,7 +2317,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.046983</td>\n",
+       "      <td>248581.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>152</td>\n",
        "      <td>62.0</td>\n",
@@ -2348,7 +2331,7 @@
        "      <td>13</td>\n",
        "      <td>34.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.850627</td>\n",
+       "      <td>2588</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>...</th>\n",
@@ -2380,7 +2363,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.453351</td>\n",
+       "      <td>275435.0</td>\n",
        "      <td>1996.0</td>\n",
        "      <td>107</td>\n",
        "      <td>162.0</td>\n",
@@ -2394,7 +2377,7 @@
        "      <td>15</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.570582</td>\n",
+       "      <td>984</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377389</th>\n",
@@ -2403,7 +2386,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.094363</td>\n",
+       "      <td>251712.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>108</td>\n",
        "      <td>24.0</td>\n",
@@ -2417,7 +2400,7 @@
        "      <td>15</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.570582</td>\n",
+       "      <td>984</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377390</th>\n",
@@ -2426,7 +2409,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>4</td>\n",
-       "      <td>0.865757</td>\n",
+       "      <td>302688.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>108</td>\n",
        "      <td>24.0</td>\n",
@@ -2440,7 +2423,7 @@
        "      <td>15</td>\n",
        "      <td>21.0</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.103695</td>\n",
+       "      <td>1745</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377391</th>\n",
@@ -2449,7 +2432,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.347409</td>\n",
+       "      <td>268434.0</td>\n",
        "      <td>2015.0</td>\n",
        "      <td>108</td>\n",
        "      <td>136.0</td>\n",
@@ -2463,7 +2446,7 @@
        "      <td>13</td>\n",
        "      <td>41.0</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.903733</td>\n",
+       "      <td>608</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377392</th>\n",
@@ -2472,7 +2455,7 @@
        "      <td>7</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.701600</td>\n",
+       "      <td>291840.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>103.0</td>\n",
@@ -2486,7 +2469,7 @@
        "      <td>6</td>\n",
        "      <td>23.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>1.417693</td>\n",
+       "      <td>3228</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377393</th>\n",
@@ -2495,7 +2478,7 @@
        "      <td>7</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.053309</td>\n",
+       "      <td>248999.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>103.0</td>\n",
@@ -2509,7 +2492,7 @@
        "      <td>6</td>\n",
        "      <td>23.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>1.417693</td>\n",
+       "      <td>3228</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377394</th>\n",
@@ -2518,7 +2501,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.306587</td>\n",
+       "      <td>225216.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>109</td>\n",
        "      <td>58815.0</td>\n",
@@ -2532,7 +2515,7 @@
        "      <td>15</td>\n",
        "      <td>22.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.668103</td>\n",
+       "      <td>2382</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377395</th>\n",
@@ -2541,7 +2524,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.351485</td>\n",
+       "      <td>222249.0</td>\n",
        "      <td>2013.0</td>\n",
        "      <td>108</td>\n",
        "      <td>17.0</td>\n",
@@ -2555,7 +2538,7 @@
        "      <td>15</td>\n",
        "      <td>22.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.668103</td>\n",
+       "      <td>2382</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377396</th>\n",
@@ -2564,7 +2547,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>0.924184</td>\n",
+       "      <td>306549.0</td>\n",
        "      <td>2007.0</td>\n",
        "      <td>109</td>\n",
        "      <td>148.0</td>\n",
@@ -2578,7 +2561,7 @@
        "      <td>15</td>\n",
        "      <td>22.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.668103</td>\n",
+       "      <td>2382</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377397</th>\n",
@@ -2587,7 +2570,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.561434</td>\n",
+       "      <td>208375.0</td>\n",
        "      <td>2003.0</td>\n",
        "      <td>109</td>\n",
        "      <td>252.0</td>\n",
@@ -2601,7 +2584,7 @@
        "      <td>13</td>\n",
        "      <td>18.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.450966</td>\n",
+       "      <td>1119</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377398</th>\n",
@@ -2624,7 +2607,7 @@
        "      <td>15</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>2.547395</td>\n",
+       "      <td>4503</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377399</th>\n",
@@ -2647,7 +2630,7 @@
        "      <td>15</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>2.547395</td>\n",
+       "      <td>4503</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377400</th>\n",
@@ -2656,7 +2639,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>4</td>\n",
-       "      <td>1.000436</td>\n",
+       "      <td>311588.0</td>\n",
        "      <td>2014.0</td>\n",
        "      <td>109</td>\n",
        "      <td>308.0</td>\n",
@@ -2670,7 +2653,7 @@
        "      <td>15</td>\n",
        "      <td>27.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>2.547395</td>\n",
+       "      <td>4503</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377401</th>\n",
@@ -2679,7 +2662,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>4</td>\n",
-       "      <td>-0.169290</td>\n",
+       "      <td>234289.0</td>\n",
        "      <td>2013.0</td>\n",
        "      <td>109</td>\n",
        "      <td>133.0</td>\n",
@@ -2693,7 +2676,7 @@
        "      <td>22</td>\n",
        "      <td>29.0</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.007060</td>\n",
+       "      <td>1620</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377402</th>\n",
@@ -2702,7 +2685,7 @@
        "      <td>0</td>\n",
        "      <td>20</td>\n",
        "      <td>10</td>\n",
-       "      <td>-1.199389</td>\n",
+       "      <td>166217.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>41</td>\n",
        "      <td>58.0</td>\n",
@@ -2716,7 +2699,7 @@
        "      <td>22</td>\n",
        "      <td>37.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>2.305506</td>\n",
+       "      <td>4230</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377403</th>\n",
@@ -2725,7 +2708,7 @@
        "      <td>0</td>\n",
        "      <td>20</td>\n",
        "      <td>10</td>\n",
-       "      <td>-0.311944</td>\n",
+       "      <td>224862.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>41</td>\n",
        "      <td>1.0</td>\n",
@@ -2739,7 +2722,7 @@
        "      <td>22</td>\n",
        "      <td>37.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>2.305506</td>\n",
+       "      <td>4230</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377404</th>\n",
@@ -2748,7 +2731,7 @@
        "      <td>0</td>\n",
        "      <td>20</td>\n",
        "      <td>10</td>\n",
-       "      <td>-0.914913</td>\n",
+       "      <td>185016.0</td>\n",
        "      <td>2015.0</td>\n",
        "      <td>41</td>\n",
        "      <td>14.0</td>\n",
@@ -2762,7 +2745,7 @@
        "      <td>22</td>\n",
        "      <td>37.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>2.305506</td>\n",
+       "      <td>4230</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377405</th>\n",
@@ -2771,7 +2754,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.619754</td>\n",
+       "      <td>204521.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>96</td>\n",
        "      <td>58815.0</td>\n",
@@ -2785,7 +2768,7 @@
        "      <td>4</td>\n",
        "      <td>28.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.675191</td>\n",
+       "      <td>2390</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377406</th>\n",
@@ -2794,7 +2777,7 @@
        "      <td>1</td>\n",
        "      <td>11</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.718312</td>\n",
+       "      <td>198008.0</td>\n",
        "      <td>NaN</td>\n",
        "      <td>128</td>\n",
        "      <td>76.0</td>\n",
@@ -2808,7 +2791,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.240089</td>\n",
+       "      <td>1357</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377407</th>\n",
@@ -2817,7 +2800,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.155414</td>\n",
+       "      <td>235206.0</td>\n",
        "      <td>2015.0</td>\n",
        "      <td>108</td>\n",
        "      <td>31.0</td>\n",
@@ -2831,7 +2814,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.225026</td>\n",
+       "      <td>1374</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377408</th>\n",
@@ -2840,7 +2823,7 @@
        "      <td>7</td>\n",
        "      <td>15</td>\n",
        "      <td>9</td>\n",
-       "      <td>-1.059686</td>\n",
+       "      <td>175449.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>141</td>\n",
        "      <td>58815.0</td>\n",
@@ -2854,7 +2837,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.813414</td>\n",
+       "      <td>2546</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377409</th>\n",
@@ -2877,7 +2860,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.813414</td>\n",
+       "      <td>2546</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377410</th>\n",
@@ -2886,7 +2869,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-1.086380</td>\n",
+       "      <td>173685.0</td>\n",
        "      <td>2003.0</td>\n",
        "      <td>109</td>\n",
        "      <td>291.0</td>\n",
@@ -2900,7 +2883,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>-1.200557</td>\n",
+       "      <td>273</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377411</th>\n",
@@ -2909,7 +2892,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.469367</td>\n",
+       "      <td>214459.0</td>\n",
        "      <td>2016.0</td>\n",
        "      <td>109</td>\n",
        "      <td>291.0</td>\n",
@@ -2923,7 +2906,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>-1.200557</td>\n",
+       "      <td>273</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377412</th>\n",
@@ -2932,7 +2915,7 @@
        "      <td>3</td>\n",
        "      <td>8</td>\n",
        "      <td>3</td>\n",
-       "      <td>-0.237462</td>\n",
+       "      <td>229784.0</td>\n",
        "      <td>2012.0</td>\n",
        "      <td>109</td>\n",
        "      <td>291.0</td>\n",
@@ -2946,7 +2929,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>-1.200557</td>\n",
+       "      <td>273</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377413</th>\n",
@@ -2969,7 +2952,7 @@
        "      <td>4</td>\n",
        "      <td>26.0</td>\n",
        "      <td>9</td>\n",
-       "      <td>2.694478</td>\n",
+       "      <td>4669</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377414</th>\n",
@@ -2978,7 +2961,7 @@
        "      <td>7</td>\n",
        "      <td>15</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.273477</td>\n",
+       "      <td>227404.0</td>\n",
        "      <td>2015.0</td>\n",
        "      <td>109</td>\n",
        "      <td>1.0</td>\n",
@@ -2992,7 +2975,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.058450</td>\n",
+       "      <td>1562</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377415</th>\n",
@@ -3001,7 +2984,7 @@
        "      <td>7</td>\n",
        "      <td>15</td>\n",
        "      <td>9</td>\n",
-       "      <td>0.194026</td>\n",
+       "      <td>258298.0</td>\n",
        "      <td>2015.0</td>\n",
        "      <td>109</td>\n",
        "      <td>9.0</td>\n",
@@ -3015,7 +2998,7 @@
        "      <td>1</td>\n",
        "      <td>NaN</td>\n",
        "      <td>7</td>\n",
-       "      <td>-0.058450</td>\n",
+       "      <td>1562</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377416</th>\n",
@@ -3024,7 +3007,7 @@
        "      <td>0</td>\n",
        "      <td>5</td>\n",
        "      <td>7</td>\n",
-       "      <td>4.216968</td>\n",
+       "      <td>524146.0</td>\n",
        "      <td>2007.0</td>\n",
        "      <td>73</td>\n",
        "      <td>38.0</td>\n",
@@ -3038,7 +3021,7 @@
        "      <td>5</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.602479</td>\n",
+       "      <td>948</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7377417</th>\n",
@@ -3047,7 +3030,7 @@
        "      <td>0</td>\n",
        "      <td>5</td>\n",
        "      <td>7</td>\n",
-       "      <td>0.129380</td>\n",
+       "      <td>254026.0</td>\n",
        "      <td>1999.0</td>\n",
        "      <td>72</td>\n",
        "      <td>3.0</td>\n",
@@ -3061,7 +3044,7 @@
        "      <td>5</td>\n",
        "      <td>NaN</td>\n",
        "      <td>9</td>\n",
-       "      <td>-0.602479</td>\n",
+       "      <td>948</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -3085,15 +3068,15 @@
        "         song_length  song_year  first_genre_type  artist_count  \\\n",
        "0                NaN        NaN               152           NaN   \n",
        "1                NaN        NaN               152           NaN   \n",
-       "2          -0.303863     2006.0                31         186.0   \n",
+       "2           225396.0     2006.0                31         186.0   \n",
        "3                NaN        NaN               152           NaN   \n",
-       "4          -0.872754     2016.0                 2          18.0   \n",
+       "4           187802.0     2016.0                 2          18.0   \n",
        "...              ...        ...               ...           ...   \n",
        "7377413          NaN        NaN               152           NaN   \n",
-       "7377414    -0.273477     2015.0               109           1.0   \n",
-       "7377415     0.194026     2015.0               109           9.0   \n",
-       "7377416     4.216968     2007.0                73          38.0   \n",
-       "7377417     0.129380     1999.0                72           3.0   \n",
+       "7377414     227404.0     2015.0               109           1.0   \n",
+       "7377415     258298.0     2015.0               109           9.0   \n",
+       "7377416     524146.0     2007.0                73          38.0   \n",
+       "7377417     254026.0     1999.0                72           3.0   \n",
        "\n",
        "         composer_count  lyricist_count  first_genre_typecount  featured_song  \\\n",
        "0                   NaN             NaN                    NaN            NaN   \n",
@@ -3122,22 +3105,22 @@
        "7377417       1.0       0.0            52     5   NaN               9   \n",
        "\n",
        "         registration_duration  \n",
-       "0                     0.420898  \n",
-       "1                     0.596334  \n",
-       "2                     0.596334  \n",
-       "3                     0.596334  \n",
-       "4                     0.420898  \n",
+       "0                         2103  \n",
+       "1                         2301  \n",
+       "2                         2301  \n",
+       "3                         2301  \n",
+       "4                         2103  \n",
        "...                        ...  \n",
-       "7377413               2.694478  \n",
-       "7377414              -0.058450  \n",
-       "7377415              -0.058450  \n",
-       "7377416              -0.602479  \n",
-       "7377417              -0.602479  \n",
+       "7377413                   4669  \n",
+       "7377414                   1562  \n",
+       "7377415                   1562  \n",
+       "7377416                    948  \n",
+       "7377417                    948  \n",
        "\n",
        "[7377418 rows x 20 columns]"
       ]
      },
-     "execution_count": 23,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3148,7 +3131,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3169,7 +3152,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
@@ -3178,18 +3161,18 @@
        "1"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 25,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "save(train, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\train_processed.pkl')"
+    "save(data_train, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\data_train.pkl')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
@@ -3198,32 +3181,61 @@
        "1"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "save(test, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\test_processed.pkl')"
+    "save(data_val, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\data_val.pkl')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 27,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "train_pkl = load(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\train.pkl')"
+    "save(label_train, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\label_train.pkl')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 28,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "test_pkl = load(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\test.pkl')"
+    "save(label_val, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\label_val.pkl')"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/feature extraction/.ipynb_checkpoints/DataPreprocessing_FeatureEngineering_Pipeline_df1n19-checkpoint.ipynb b/tfn/notebooks/DataPreprocessing_FeatureEngineering_Pipeline_df1n19_SVM v1.ipynb
similarity index 77%
rename from feature extraction/.ipynb_checkpoints/DataPreprocessing_FeatureEngineering_Pipeline_df1n19-checkpoint.ipynb
rename to tfn/notebooks/DataPreprocessing_FeatureEngineering_Pipeline_df1n19_SVM v1.ipynb
index 2ecaf3a83da5ab997212856432ff38704638c5e1..70103609fab17932a97c1e95dca652847cc49b75 100644
--- a/feature extraction/.ipynb_checkpoints/DataPreprocessing_FeatureEngineering_Pipeline_df1n19-checkpoint.ipynb	
+++ b/tfn/notebooks/DataPreprocessing_FeatureEngineering_Pipeline_df1n19_SVM v1.ipynb	
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 89,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -25,149 +25,63 @@
     "import re \n",
     "\n",
     "from sklearn.model_selection import train_test_split\n",
-    "from sklearn import preprocessing\n"
+    "from sklearn import preprocessing"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": []
-  },
-  {
-   "cell_type": "raw",
-   "metadata": {},
-   "source": []
+   "source": [
+    "## 1. Loading Data"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 100,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
-    "id = members.msno[16867]"
+    "# author@Fiona\n",
+    "# read data from csv files\n",
+    "members = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\members.csv',parse_dates=['registration_init_time','expiration_date'])\n",
+    "songs = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\songs.csv')\n",
+    "songs_extra = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\song_extra_info.csv')\n",
+    "train = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\train.csv')\n",
+    "test = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\test.csv')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 102,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\users\\user\\miniconda3\\lib\\site-packages\\ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame\n",
+      "\n",
+      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
+     ]
+    },
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>msno</th>\n",
-       "      <th>song_id</th>\n",
-       "      <th>source_system_tab</th>\n",
-       "      <th>source_screen_name</th>\n",
-       "      <th>source_type</th>\n",
-       "      <th>target</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>2487533</th>\n",
-       "      <td>1Y+bNz3FxSoJnKOcR/Q8VJGXZbWIstrW0HfBe5LZzKA=</td>\n",
-       "      <td>WznMG5LmzE4k7q1OQLPAV2s96k8ZIrVvG/rihErlYWk=</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2487534</th>\n",
-       "      <td>1Y+bNz3FxSoJnKOcR/Q8VJGXZbWIstrW0HfBe5LZzKA=</td>\n",
-       "      <td>DdKsqy3JAygpcHwihcjBKzzp8SDYhdtXbEZmhKDrOSo=</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2487535</th>\n",
-       "      <td>1Y+bNz3FxSoJnKOcR/Q8VJGXZbWIstrW0HfBe5LZzKA=</td>\n",
-       "      <td>xEjg9Bs0QcYD3BBQrzPUk89Eb2jBCWu/aki+pOy6H0w=</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "                                                 msno  \\\n",
-       "2487533  1Y+bNz3FxSoJnKOcR/Q8VJGXZbWIstrW0HfBe5LZzKA=   \n",
-       "2487534  1Y+bNz3FxSoJnKOcR/Q8VJGXZbWIstrW0HfBe5LZzKA=   \n",
-       "2487535  1Y+bNz3FxSoJnKOcR/Q8VJGXZbWIstrW0HfBe5LZzKA=   \n",
-       "\n",
-       "                                              song_id source_system_tab  \\\n",
-       "2487533  WznMG5LmzE4k7q1OQLPAV2s96k8ZIrVvG/rihErlYWk=               NaN   \n",
-       "2487534  DdKsqy3JAygpcHwihcjBKzzp8SDYhdtXbEZmhKDrOSo=               NaN   \n",
-       "2487535  xEjg9Bs0QcYD3BBQrzPUk89Eb2jBCWu/aki+pOy6H0w=               NaN   \n",
-       "\n",
-       "        source_screen_name source_type  target  \n",
-       "2487533                NaN         NaN       0  \n",
-       "2487534                NaN         NaN       0  \n",
-       "2487535                NaN         NaN       0  "
+       "Timestamp('2014-05-01 00:00:00')"
       ]
      },
-     "execution_count": 102,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "train[train.msno == id]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 1. Loading Data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 90,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# author@Fiona\n",
-    "# read data from csv files\n",
-    "members = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\members.csv',parse_dates=['registration_init_time','expiration_date'])\n",
-    "# members = pd.read_csv('members.csv')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 91,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "songs = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\songs.csv')\n",
-    "songs_extra = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\song_extra_info.csv')\n",
-    "train = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\train.csv')\n",
-    "test = pd.read_csv(r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\test.csv')"
+    "# Outlier preprocess for members.expiration_date row 16867\n",
+    "\n",
+    "members.expiration_date[16867] = members.registration_init_time[16867]\n",
+    "members.expiration_date[16867]"
    ]
   },
   {
@@ -179,7 +93,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 92,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -1008,7 +922,7 @@
        "[1048575 rows x 9 columns]"
       ]
      },
-     "execution_count": 92,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1032,7 +946,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1159,19 +1073,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Outlier preprocess for members.expiration_date row 16867\n",
-    "\n",
-    "members.expiration_date[16867] = members.registration_init_time[16867]\n",
-    "members.expiration_date[16867]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1182,7 +1084,7 @@
     "    members.registration_init_time = members.registration_init_time.apply(lambda x : time.mktime(x.timetuple()))\n",
     "    members.expiration_date = members.expiration_date.apply(lambda x : time.mktime(x.timetuple()))\n",
     "    #count Duration\n",
-    "    members['registration_duration'] = ((members['expiration_date'] -members['registration_init_time'])/(24*60*60))\n",
+    "    members['registration_duration'] = ((members['expiration_date'] - members['registration_init_time'])/(24*60*60))\n",
     "    members['registration_duration'] = members['registration_duration'].apply(int)\n",
     "    return members\n",
     "\n",
@@ -1208,7 +1110,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1225,7 +1127,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -1348,7 +1250,7 @@
        "4      52.0  Mary Had a Little Lamb           NaN  "
       ]
      },
-     "execution_count": 13,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1359,7 +1261,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1372,16 +1274,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
-    "songs = apply_pipeline(songs, feature_pipeline_song)\n"
+    "songs = apply_pipeline(songs, feature_pipeline_song)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -1522,7 +1424,7 @@
        "4                  16114              0         1         0            52  "
       ]
      },
-     "execution_count": 16,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1533,7 +1435,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1542,16 +1444,110 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>msno</th>\n",
+       "      <th>city</th>\n",
+       "      <th>bd</th>\n",
+       "      <th>registered_via</th>\n",
+       "      <th>registration_duration</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>7</td>\n",
+       "      <td>2223</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>7</td>\n",
+       "      <td>725</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4</td>\n",
+       "      <td>457</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>9</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4</td>\n",
+       "      <td>138</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                           msno  city  bd  registered_via  \\\n",
+       "0  XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=     1 NaN               7   \n",
+       "1  UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=     1 NaN               7   \n",
+       "2  D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=     1 NaN               4   \n",
+       "3  mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=     1 NaN               9   \n",
+       "4  q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=     1 NaN               4   \n",
+       "\n",
+       "   registration_duration  \n",
+       "0                   2223  \n",
+       "1                    725  \n",
+       "2                    457  \n",
+       "3                      1  \n",
+       "4                    138  "
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "members.head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1565,7 +1561,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop the lyricist_count and composer_count columns\n",
+    "train = train.drop('lyricist_count', axis = 1)\n",
+    "train = train.drop('composer_count', axis = 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test = test.drop('lyricist_count', axis = 1)\n",
+    "test = test.drop('composer_count', axis = 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1577,26 +1594,86 @@
     "    for i in transfer:\n",
     "        data[i] = np.array(le.fit_transform(data[i].tolist()))\n",
     "\n",
-    "\n",
     "labelencoding(train, transfer)\n",
     "labelencoding(test, transfer)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# training and validation dataset split\n"
+    "# Missing-value handling\n",
+    "# Fill NaN values in each affected column with that column's mode\n",
+    "missing_attribute_list = dict(train.isnull().any())\n",
+    "attribute_name = [i for i in list(missing_attribute_list.keys()) if missing_attribute_list[i] == True ]\n",
+    "for i in attribute_name:\n",
+    "    train[i].fillna(train[i].mode()[0], inplace = True)\n",
+    "    test[i].fillna(test[i].mode()[0], inplace = True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 79,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
+    "#np.array(train.song_id).reshape(-1,1).ravel()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Separate the target labels from the train dataset\n",
+    "\n",
+    "label = train.target.tolist()\n",
+    "train = train.drop('target', axis = 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Min-max normalise every column in train.columns (not only registration_duration & song_length)\n",
+    "\n",
+    "transfer_list = train.columns.tolist()\n",
+    "def normalize(data, transfer_list):\n",
+    "    min_max_scaler = preprocessing.MinMaxScaler()\n",
+    "    for i in transfer_list:\n",
+    "        temp = min_max_scaler.fit_transform(np.array(data[i]).reshape(-1,1))\n",
+    "        data[i] = temp.ravel()\n",
+    "        #data[i] = min_max_scaler(data[i])\n",
+    "        \n",
+    "normalize(train, transfer_list)\n",
+    "normalize(test, transfer_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "# training and validation dataset split 70:30 & shuffle\n",
+    "\n",
+    "data_train, data_val, label_train, label_val = train_test_split(train, label, test_size=0.3, shuffle=True )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# save and load the data using pickle (Optional)\n",
     "import pickle\n",
     "\n",
     "def save(obj, filename):\n",
@@ -1613,7 +1690,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 83,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
@@ -1622,18 +1699,18 @@
        "1"
       ]
      },
-     "execution_count": 83,
+     "execution_count": 32,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "save(train, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\train.pkl')"
+    "save(data_train, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\data_train.pkl')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 84,
+   "execution_count": 33,
    "metadata": {},
    "outputs": [
     {
@@ -1642,445 +1719,53 @@
        "1"
       ]
      },
-     "execution_count": 84,
+     "execution_count": 33,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "save(test, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\test.pkl')"
+    "save(data_val, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\data_val.pkl')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 34,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>msno</th>\n",
-       "      <th>song_id</th>\n",
-       "      <th>source_system_tab</th>\n",
-       "      <th>source_screen_name</th>\n",
-       "      <th>source_type</th>\n",
-       "      <th>target</th>\n",
-       "      <th>song_length</th>\n",
-       "      <th>song_year</th>\n",
-       "      <th>first_genre_type</th>\n",
-       "      <th>artist_count</th>\n",
-       "      <th>...</th>\n",
-       "      <th>lyricist_count</th>\n",
-       "      <th>first_genre_typecount</th>\n",
-       "      <th>featured_song</th>\n",
-       "      <th>same_c_l</th>\n",
-       "      <th>all_same</th>\n",
-       "      <th>song_language</th>\n",
-       "      <th>city</th>\n",
-       "      <th>bd</th>\n",
-       "      <th>registered_via</th>\n",
-       "      <th>registration_duration</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>8158</td>\n",
-       "      <td>74679</td>\n",
-       "      <td>explore</td>\n",
-       "      <td>Explore</td>\n",
-       "      <td>online-playlist</td>\n",
-       "      <td>1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>7</td>\n",
-       "      <td>2103</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>17259</td>\n",
-       "      <td>223479</td>\n",
-       "      <td>my library</td>\n",
-       "      <td>Local playlist more</td>\n",
-       "      <td>local-playlist</td>\n",
-       "      <td>1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>13</td>\n",
-       "      <td>24.0</td>\n",
-       "      <td>9</td>\n",
-       "      <td>2301</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>17259</td>\n",
-       "      <td>120758</td>\n",
-       "      <td>my library</td>\n",
-       "      <td>Local playlist more</td>\n",
-       "      <td>local-playlist</td>\n",
-       "      <td>1</td>\n",
-       "      <td>225396.0</td>\n",
-       "      <td>2006.0</td>\n",
-       "      <td>1259</td>\n",
-       "      <td>186.0</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>44584.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>52</td>\n",
-       "      <td>13</td>\n",
-       "      <td>24.0</td>\n",
-       "      <td>9</td>\n",
-       "      <td>2301</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>17259</td>\n",
-       "      <td>23707</td>\n",
-       "      <td>my library</td>\n",
-       "      <td>Local playlist more</td>\n",
-       "      <td>local-playlist</td>\n",
-       "      <td>1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>13</td>\n",
-       "      <td>24.0</td>\n",
-       "      <td>9</td>\n",
-       "      <td>2301</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>8158</td>\n",
-       "      <td>33308</td>\n",
-       "      <td>explore</td>\n",
-       "      <td>Explore</td>\n",
-       "      <td>online-playlist</td>\n",
-       "      <td>1</td>\n",
-       "      <td>187802.0</td>\n",
-       "      <td>2016.0</td>\n",
-       "      <td>1011</td>\n",
-       "      <td>18.0</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>13030.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>52</td>\n",
-       "      <td>1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>7</td>\n",
-       "      <td>2103</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>5 rows × 21 columns</p>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "    msno  song_id source_system_tab   source_screen_name      source_type  \\\n",
-       "0   8158    74679           explore              Explore  online-playlist   \n",
-       "1  17259   223479        my library  Local playlist more   local-playlist   \n",
-       "2  17259   120758        my library  Local playlist more   local-playlist   \n",
-       "3  17259    23707        my library  Local playlist more   local-playlist   \n",
-       "4   8158    33308           explore              Explore  online-playlist   \n",
-       "\n",
-       "   target  song_length  song_year first_genre_type  artist_count  ...  \\\n",
-       "0       1          NaN        NaN              NaN           NaN  ...   \n",
-       "1       1          NaN        NaN              NaN           NaN  ...   \n",
-       "2       1     225396.0     2006.0             1259         186.0  ...   \n",
-       "3       1          NaN        NaN              NaN           NaN  ...   \n",
-       "4       1     187802.0     2016.0             1011          18.0  ...   \n",
-       "\n",
-       "   lyricist_count  first_genre_typecount  featured_song  same_c_l  all_same  \\\n",
-       "0             NaN                    NaN            NaN       NaN       NaN   \n",
-       "1             NaN                    NaN            NaN       NaN       NaN   \n",
-       "2             NaN                44584.0            0.0       0.0       0.0   \n",
-       "3             NaN                    NaN            NaN       NaN       NaN   \n",
-       "4             NaN                13030.0            0.0       0.0       0.0   \n",
-       "\n",
-       "   song_language city    bd  registered_via  registration_duration  \n",
-       "0            NaN    1   NaN               7                   2103  \n",
-       "1            NaN   13  24.0               9                   2301  \n",
-       "2             52   13  24.0               9                   2301  \n",
-       "3            NaN   13  24.0               9                   2301  \n",
-       "4             52    1   NaN               7                   2103  \n",
-       "\n",
-       "[5 rows x 21 columns]"
+       "1"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 34,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "train.head()"
+    "save(label_train, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\label_train.pkl')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 85,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>msno</th>\n",
-       "      <th>song_id</th>\n",
-       "      <th>source_system_tab</th>\n",
-       "      <th>source_screen_name</th>\n",
-       "      <th>source_type</th>\n",
-       "      <th>song_length</th>\n",
-       "      <th>song_year</th>\n",
-       "      <th>first_genre_type</th>\n",
-       "      <th>artist_count</th>\n",
-       "      <th>...</th>\n",
-       "      <th>lyricist_count</th>\n",
-       "      <th>first_genre_typecount</th>\n",
-       "      <th>featured_song</th>\n",
-       "      <th>same_c_l</th>\n",
-       "      <th>all_same</th>\n",
-       "      <th>song_language</th>\n",
-       "      <th>city</th>\n",
-       "      <th>bd</th>\n",
-       "      <th>registered_via</th>\n",
-       "      <th>registration_duration</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0</td>\n",
-       "      <td>12934</td>\n",
-       "      <td>122191</td>\n",
-       "      <td>3</td>\n",
-       "      <td>8</td>\n",
-       "      <td>3</td>\n",
-       "      <td>224130.0</td>\n",
-       "      <td>2014.0</td>\n",
-       "      <td>103</td>\n",
-       "      <td>77.0</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>11233.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>3</td>\n",
-       "      <td>1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>7</td>\n",
-       "      <td>577</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1</td>\n",
-       "      <td>12934</td>\n",
-       "      <td>217907</td>\n",
-       "      <td>3</td>\n",
-       "      <td>8</td>\n",
-       "      <td>3</td>\n",
-       "      <td>320470.0</td>\n",
-       "      <td>2010.0</td>\n",
-       "      <td>104</td>\n",
-       "      <td>236.0</td>\n",
-       "      <td>...</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>304098.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>3</td>\n",
-       "      <td>1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>7</td>\n",
-       "      <td>577</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>2</td>\n",
-       "      <td>712</td>\n",
-       "      <td>37385</td>\n",
-       "      <td>0</td>\n",
-       "      <td>22</td>\n",
-       "      <td>10</td>\n",
-       "      <td>315899.0</td>\n",
-       "      <td>2010.0</td>\n",
-       "      <td>55</td>\n",
-       "      <td>76.0</td>\n",
-       "      <td>...</td>\n",
-       "      <td>66.0</td>\n",
-       "      <td>75940.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>17</td>\n",
-       "      <td>1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>4</td>\n",
-       "      <td>7</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>3</td>\n",
-       "      <td>1383</td>\n",
-       "      <td>224360</td>\n",
-       "      <td>6</td>\n",
-       "      <td>16</td>\n",
-       "      <td>8</td>\n",
-       "      <td>285210.0</td>\n",
-       "      <td>2002.0</td>\n",
-       "      <td>104</td>\n",
-       "      <td>288.0</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>304098.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>52</td>\n",
-       "      <td>3</td>\n",
-       "      <td>30.0</td>\n",
-       "      <td>9</td>\n",
-       "      <td>3567</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>4</td>\n",
-       "      <td>1383</td>\n",
-       "      <td>85597</td>\n",
-       "      <td>6</td>\n",
-       "      <td>16</td>\n",
-       "      <td>8</td>\n",
-       "      <td>197590.0</td>\n",
-       "      <td>2011.0</td>\n",
-       "      <td>132</td>\n",
-       "      <td>20.0</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>9111.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>3</td>\n",
-       "      <td>30.0</td>\n",
-       "      <td>9</td>\n",
-       "      <td>3567</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>5 rows × 21 columns</p>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "   id   msno  song_id  source_system_tab  source_screen_name  source_type  \\\n",
-       "0   0  12934   122191                  3                   8            3   \n",
-       "1   1  12934   217907                  3                   8            3   \n",
-       "2   2    712    37385                  0                  22           10   \n",
-       "3   3   1383   224360                  6                  16            8   \n",
-       "4   4   1383    85597                  6                  16            8   \n",
-       "\n",
-       "   song_length  song_year  first_genre_type  artist_count  ...  \\\n",
-       "0     224130.0     2014.0               103          77.0  ...   \n",
-       "1     320470.0     2010.0               104         236.0  ...   \n",
-       "2     315899.0     2010.0                55          76.0  ...   \n",
-       "3     285210.0     2002.0               104         288.0  ...   \n",
-       "4     197590.0     2011.0               132          20.0  ...   \n",
-       "\n",
-       "   lyricist_count  first_genre_typecount  featured_song  same_c_l  all_same  \\\n",
-       "0             NaN                11233.0            0.0       0.0       0.0   \n",
-       "1             2.0               304098.0            0.0       0.0       0.0   \n",
-       "2            66.0                75940.0            0.0       1.0       0.0   \n",
-       "3             NaN               304098.0            0.0       0.0       0.0   \n",
-       "4             NaN                 9111.0            0.0       0.0       0.0   \n",
-       "\n",
-       "   song_language city    bd  registered_via  registration_duration  \n",
-       "0              3    1   NaN               7                    577  \n",
-       "1              3    1   NaN               7                    577  \n",
-       "2             17    1   NaN               4                      7  \n",
-       "3             52    3  30.0               9                   3567  \n",
-       "4            NaN    3  30.0               9                   3567  \n",
-       "\n",
-       "[5 rows x 21 columns]"
+       "1"
       ]
      },
-     "execution_count": 85,
+     "execution_count": 35,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "test.head()"
+    "save(label_val, r'D:\\Project\\DS\\Data Mining\\cw1\\kkbox_recommendation\\tfn\\data\\label_val.pkl')"
    ]
   },
   {
diff --git a/tfn/notebooks/EDA_continued.ipynb b/tfn/notebooks/EDA_continued.ipynb
index 5963a5358fe0a77db9a5760ddb4fdde66af8802d..3d0129935bfa1f75628eb9c93e58c4e34a2b180a 100644
--- a/tfn/notebooks/EDA_continued.ipynb
+++ b/tfn/notebooks/EDA_continued.ipynb
@@ -2053,7 +2053,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.9"
+   "version": "3.7.3"
   }
  },
  "nbformat": 4,
diff --git a/tfn/notebooks/count number v1.xlsx b/tfn/notebooks/count number v1.xlsx
deleted file mode 100644
index c6c0aa6b214b05699c1b0561bfedb625908d58c1..0000000000000000000000000000000000000000
Binary files a/tfn/notebooks/count number v1.xlsx and /dev/null differ
diff --git a/tfn/notebooks/debug/members_expire_date_16867.PNG b/tfn/notebooks/debug/members_expire_date_16867.PNG
deleted file mode 100644
index 5e8570e517dd2e8e660de46ded0a1ed2e40ce981..0000000000000000000000000000000000000000
Binary files a/tfn/notebooks/debug/members_expire_date_16867.PNG and /dev/null differ