Initial commit

bff68516 · ckf1n19 · a325a53e · a325a53e · a325a53e · bff68516
Commit bff68516 authored 5 years ago by ckf1n19
--- a/tfn/feature_extraction/members.py
+++ b/tfn/feature_extraction/members.py
--- a/tfn/feature_extraction/songs.py
+++ b/tfn/feature_extraction/songs.py
-import numpy as np
-import pandas as pd
-import datetime
-import time
-import re 
-
-
-# author@Fiona
-# song feature extraction functions 
-
-# get song_year feature from isrc code
-def isrc_to_year(isrc, pivot=17):
-    """Derive a four-digit year from an ISRC string.
-
-    The two characters at positions 5-6 of the code are read as a
-    two-digit year. Values greater than ``pivot`` are placed in the
-    1900s, all other values in the 2000s.
-
-    Args:
-        isrc: ISRC code as a string. Any non-string value (for example
-            the NaN of a missing cell) is treated as missing.
-        pivot: Largest two-digit year still mapped to the 2000s.
-            Defaults to 17, the cutoff that used to be hard-coded here.
-
-    Returns:
-        The year as an ``int``, or ``np.nan`` when ``isrc`` is not a
-        string or its year field is not exactly two decimal digits.
-    """
-    if not isinstance(isrc, str):
-        return np.nan
-    year_field = isrc[5:7]
-    # Truncated or malformed codes yield NaN instead of an int() error.
-    if len(year_field) != 2 or not year_field.isdecimal():
-        return np.nan
-    two_digit_year = int(year_field)
-    century = 1900 if two_digit_year > pivot else 2000
-    return century + two_digit_year
-
-# add new feature 
-def add_song_year(songs):
-    """Attach a ``song_year`` column computed from each row's ``isrc``.
-
-    ``songs`` is modified in place and the same object is returned.
-    """
-    release_years = songs['isrc'].apply(isrc_to_year)
-    songs['song_year'] = release_years
-    return songs
-    
-# add 'first_genre_type' feature: the first id in the '|'-separated genre_ids
-def add_first_genre_type(songs):
-    """Add ``first_genre_type``: the text before the first ``|`` in ``genre_ids``.
-
-    Every ``genre_ids`` value is converted with ``str`` first, so a
-    missing value ends up as the literal string ``'nan'``.  ``songs`` is
-    modified in place and returned.
-    """
-    genre_text = songs['genre_ids'].apply(str)
-    leading_genre = genre_text.apply(lambda text: text.partition('|')[0])
-    songs['first_genre_type'] = leading_genre
-    return songs
-
-# get song played counts
-# Count the plays of each song and store them in a new dict, song_played_counts;
-# the dict is then joined back onto the original table on song_id.
-# The dictionary records the number of times each song is played:
-# key = song_id, value = number of times the song is played
-
-def song_played_counts(songs):
-    """Count how many times each ``song_id`` value occurs in ``songs``.
-
-    Args:
-        songs: Object whose ``['song_id']`` entry is an iterable of ids
-            (for example a DataFrame column).
-
-    Returns:
-        A plain ``dict`` mapping each id to its number of occurrences,
-        in order of first appearance.
-    """
-    from collections import Counter
-
-    # iter() forces element-wise counting even if the column object
-    # happens to be mapping-like.
-    return dict(Counter(iter(songs['song_id'])))
-
-# add song played counts feature 'hit songs'
-def add_song_played_times(songs):
-    """Left-join a ``song_played_times`` column onto ``songs`` by ``song_id``.
-
-    The counts come from ``song_played_counts``; a new merged frame is
-    returned.
-    """
-    counts_by_song = pd.Series(song_played_counts(songs))
-    counts_frame = pd.DataFrame(counts_by_song, columns = ['song_played_times'])
-    # Move the ids out of the index so they can serve as the join key.
-    counts_frame = counts_frame.reset_index()
-    counts_frame = counts_frame.rename(columns = {'index' : 'song_id'})
-    return songs.merge(counts_frame, on = 'song_id', how = 'left')
-    
-# add artist counts feature 'hot artist'
-def add_artist_counts(songs):
-    """Left-join an ``artist_count`` column: rows sharing each ``artist_name``.
-
-    Uses ``groupby(...).size()`` instead of passing a renaming dict to
-    ``SeriesGroupBy.agg``, which current pandas rejects.
-
-    Args:
-        songs: DataFrame with an ``artist_name`` column.
-
-    Returns:
-        A new DataFrame: ``songs`` merged with the per-artist counts.
-    """
-    artist_counts = (
-        songs.groupby('artist_name')
-        .size()
-        .reset_index(name='artist_count')
-    )
-    return songs.merge(artist_counts, on='artist_name', how='left')
-
-# add composer played counts
-## Count the number of times the composer has been listened
-def add_composer_counts(songs):
-    """Left-join a ``composer_count`` column: rows sharing each ``composer``.
-
-    Uses ``groupby(...).size()`` instead of passing a renaming dict to
-    ``SeriesGroupBy.agg``, which current pandas rejects.
-
-    Args:
-        songs: DataFrame with a ``composer`` column.
-
-    Returns:
-        A new DataFrame: ``songs`` merged with the per-composer counts.
-    """
-    composer_counts = (
-        songs.groupby('composer')
-        .size()
-        .reset_index(name='composer_count')
-    )
-    return songs.merge(composer_counts, on='composer', how='left')
-
-
-#add lyricist played counts
-# Count the number of times the lyricist has been listened
-def add_lyricist_counts(songs):
-    """Left-join a ``lyricist_count`` column: rows sharing each ``lyricist``.
-
-    Uses ``groupby(...).size()`` instead of passing a renaming dict to
-    ``SeriesGroupBy.agg``, which current pandas rejects.
-
-    Args:
-        songs: DataFrame with a ``lyricist`` column.
-
-    Returns:
-        A new DataFrame: ``songs`` merged with the per-lyricist counts.
-    """
-    lyricist_counts = (
-        songs.groupby('lyricist')
-        .size()
-        .reset_index(name='lyricist_count')
-    )
-    return songs.merge(lyricist_counts, on='lyricist', how='left')
-
-# add genre type counts 'popular genres'
-# grouping must use the first_genre_type attribute
-def add_genere_counts(songs):
-    """Left-join ``first_genre_typecount``: rows sharing each ``first_genre_type``.
-
-    Uses ``groupby(...).size()`` instead of passing a renaming dict to
-    ``SeriesGroupBy.agg``, which current pandas rejects.
-
-    Args:
-        songs: DataFrame that already has a ``first_genre_type`` column.
-
-    Returns:
-        A new DataFrame: ``songs`` merged with the per-genre counts.
-    """
-    genre_counts = (
-        songs.groupby('first_genre_type')
-        .size()
-        .reset_index(name='first_genre_typecount')
-    )
-    return songs.merge(genre_counts, on='first_genre_type', how='left')
-
-
-# add feat feature 
-def add_feat_feature(songs):
-    """Add ``featured_song``: 1 if ``'feat'`` occurs in ``artist_name``, else 0.
-
-    The test is a case-sensitive substring check on ``str(artist_name)``.
-    Only the ``artist_name`` column is mapped, rather than applying a
-    function to every full row, which also keeps the result well defined
-    for an empty frame.
-
-    Args:
-        songs: DataFrame with an ``artist_name`` column; modified in place.
-
-    Returns:
-        The same ``songs`` object with the new column.
-    """
-    songs['featured_song'] = songs['artist_name'].map(
-        lambda name: 1 if 'feat' in str(name) else 0
-    )
-    return songs
-    
-
-# add if_artist_composer_lyricist_are_same feature:
-def add_is_same_feature(songs):
-    """Add two 0/1 flags comparing the credit columns of each row.
-
-    ``same_c_l`` is 1 when ``composer == lyricist``.  ``all_same`` is 1
-    when ``artist_name``, ``composer`` and ``lyricist`` are all equal.
-    Values are compared with Python ``==``, so a NaN never equals
-    anything.  The three columns are walked in lockstep instead of
-    applying a function to every full row.
-
-    Args:
-        songs: DataFrame with ``artist_name``, ``composer`` and
-            ``lyricist`` columns; modified in place.
-
-    Returns:
-        The same ``songs`` object with the two new columns.
-    """
-    artists = list(songs['artist_name'])
-    composers = list(songs['composer'])
-    lyricists = list(songs['lyricist'])
-    songs['same_c_l'] = [
-        1 if composer == lyricist else 0
-        for composer, lyricist in zip(composers, lyricists)
-    ]
-    songs['all_same'] = [
-        1 if artist == composer and composer == lyricist else 0
-        for artist, composer, lyricist in zip(artists, composers, lyricists)
-    ]
-    return songs
-
-
-#deal with missing values
-def songs_language_to_str(language):
-    """Convert a numeric language code to its integer string form.
-
-    Args:
-        language: Language code, expected to be a float (``np.float64``
-            included).
-
-    Returns:
-        ``str(int(language))`` for a finite float other than ``-1.0``.
-        ``np.nan`` for ``-1.0``, NaN, infinity, or any non-float value.
-    """
-    # Type check first: np.isnan/np.isfinite raise TypeError on
-    # non-numeric input such as strings or None.
-    if not isinstance(language, float):
-        return np.nan
-    # -1.0 is treated as "unknown"; non-finite values cannot pass int().
-    if language == -1.0 or not np.isfinite(language):
-        return np.nan
-    return str(int(language))
-    
-def missing_value(songs):
-    """Fill missing ``song_length`` values and derive ``song_language``.
-
-    Args:
-        songs: DataFrame with ``song_length`` and ``language`` columns;
-            modified in place.
-
-    Returns:
-        The same ``songs`` object.
-    """
-    # deal with song_length missing value: replace it with the column mean.
-    # Assign the result back instead of calling fillna(inplace=True) on
-    # the column accessor, which may act on a temporary copy.
-    songs['song_length'] = songs['song_length'].fillna(songs['song_length'].mean())
-    # deal with language missing value
-    songs['song_language'] = songs['language'].apply(songs_language_to_str)
-    return songs
-
-    
-#delete unuseful features
-def del_unuseful_feature(songs):
-    """Return ``songs`` without the raw columns listed below.
-
-    The input frame itself is left untouched; ``drop`` produces a new one.
-    """
-    obsolete_columns = [
-        'isrc',
-        'name',
-        'genre_ids',
-        'artist_name',
-        'composer',
-        'lyricist',
-        'language',
-    ]
-    return songs.drop(obsolete_columns, axis = 1)
-    
-# finish song features extraction
\ No newline at end of file
--- a/tfn/pickle_file.py
+++ b/tfn/pickle_file.py
+import pickle
+
+def save(obj, filename):
+    """Pickle ``obj`` into ``filename`` using the highest available protocol.
+
+    An existing file at that path is overwritten.  Always returns 1.
+    """
+    with open(filename, 'wb') as sink:
+        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
+    return 1
+
+##-- Load obj from file    
+def load(filename):
+    """Read and return the pickled object stored in ``filename``.
+
+    Args:
+        filename: Path of a file containing a pickle.
+
+    Returns:
+        The unpickled object.
+
+    Warning:
+        Unpickling can execute arbitrary code.  Only call this on files
+        from a trusted source.
+    """
+    # Named pickle_file rather than input to avoid shadowing the builtin.
+    with open(filename, 'rb') as pickle_file:
+        return pickle.load(pickle_file)
\ No newline at end of file
--- a/tfn/preprocess.py
+++ b/tfn/preprocess.py
+from tfn import TRAIN_FILE, TEST_FILE, MEMBERS_FILE, SONGS_FILE, SONGS_EXTRA_FILE
+
+import numpy as np
+import pandas as pd
+import datetime
+import time
+import re 
+
+from sklearn.model_selection import train_test_split
+from sklearn import preprocessing
+
+def _get_training_data_from_csv():
+    """Read the five input CSV files into DataFrames.
+
+    Each file is read with its first row as the header.
+
+    Returns:
+        A 5-tuple ``(train, test, members, songs, songs_extra)`` read
+        from ``TRAIN_FILE``, ``TEST_FILE``, ``MEMBERS_FILE``,
+        ``SONGS_FILE`` and ``SONGS_EXTRA_FILE`` respectively.
+    """
+    # The training frame has its own name so the later songs read does
+    # not overwrite it.
+    train = pd.read_csv(TRAIN_FILE, header=0)
+    test = pd.read_csv(TEST_FILE, header=0)
+    members = pd.read_csv(MEMBERS_FILE, header=0)
+    songs = pd.read_csv(SONGS_FILE, header=0)
+    songs_extra = pd.read_csv(SONGS_EXTRA_FILE, header=0)
+
+    return train, test, members, songs, songs_extra
+
+
+if __name__ == '__main__':
+    # Row cap applied to the song tables before merging.
+    # NOTE(review): 1048575 looks like a spreadsheet row limit minus the
+    # header row - confirm this truncation is intended.
+    max_rows = 1048575
+
+    # The first returned frame gets its own name so it is not overwritten
+    # by the songs frame during unpacking.
+    train, test, members, songs, songs_extra = _get_training_data_from_csv()
+
+    # Keep only the leading rows; rebinding the names releases the
+    # references to the full-size frames.
+    songs = songs.head(max_rows)
+    songs_extra = songs_extra.head(max_rows)
+
+    # merge songinfo file
+    songs = songs.merge(songs_extra, on = 'song_id', how = 'left')
+    del songs_extra
\ No newline at end of file