Skip to content
Snippets Groups Projects
Commit bff68516 authored by ckf1n19's avatar ckf1n19
Browse files

Initial commit

parent a325a53e
No related branches found
No related tags found
No related merge requests found
import numpy as np
import pandas as pd
import datetime
import time
import re
# author@Fiona
# song feature extraction functions
# get song_year feature from isrc code
def isrc_to_year(isrc, pivot=17):
    """Derive a four-digit year from the two-digit year field of an ISRC code.

    The characters at positions 5-6 of ``isrc`` are parsed as a two-digit
    year. Values greater than ``pivot`` are placed in the 1900s, all other
    values in the 2000s.

    Args:
        isrc: ISRC code. Anything that is not a ``str`` (for example the NaN
            pandas uses for a missing value) is treated as missing.
        pivot: Largest two-digit year that is still mapped to the 2000s.
            Defaults to 17, the cut-off that used to be hard-coded here.

    Returns:
        The year as an ``int``, or ``np.nan`` when ``isrc`` is missing or its
        year field cannot be parsed as an integer.
    """
    if not isinstance(isrc, str):
        return np.nan
    try:
        two_digit_year = int(isrc[5:7])
    except ValueError:
        # A short or malformed code should yield a missing value instead of
        # aborting a whole ``Series.apply`` pass.
        return np.nan
    century = 1900 if two_digit_year > pivot else 2000
    return century + two_digit_year
# add new feature
def add_song_year(songs):
    """Attach a ``song_year`` column computed from each row's ``isrc`` value.

    The column is written onto ``songs`` itself, and that same object is
    returned.
    """
    release_years = songs['isrc'].apply(isrc_to_year)
    songs['song_year'] = release_years
    return songs
# add genre id counts feature 'popular genres'
def add_first_genre_type(songs):
    """Add ``first_genre_type``: the text before the first ``|`` in ``genre_ids``.

    Every value is passed through ``str`` first, so a missing (NaN) entry
    produces the string ``'nan'``. ``songs`` is modified in place and
    returned.
    """
    def leading_genre(raw_value):
        # Everything up to the first separator; the whole text if none.
        return str(raw_value).split('|')[0]

    songs['first_genre_type'] = songs['genre_ids'].apply(leading_genre)
    return songs
# get song played counts
# Count the plays in a loop and store them in a new dictionary (song_played_counts),
# then join song_played_counts back onto the original table on song_id.
#dictionary used to save times a song is played
#key = song_id, value = number of times the song's played
def song_played_counts(songs):
    """Count how many times each ``song_id`` value occurs in ``songs``.

    Args:
        songs: Table (e.g. a DataFrame) whose ``'song_id'`` entry can be
            iterated over.

    Returns:
        A plain ``dict`` mapping song_id -> number of occurrences, with keys
        in first-seen order.
    """
    # Imported locally so the block does not rely on the file's import header.
    from collections import Counter

    # Counter replaces the hand-written tally loop; converting back to dict
    # keeps the return type callers already receive.
    return dict(Counter(songs['song_id']))
# add song played counts feature 'hit songs'
def add_song_played_times(songs):
    """Left-join a ``song_played_times`` column onto ``songs``.

    The counts come from ``song_played_counts`` and are matched back to the
    rows on ``song_id``. A new, merged DataFrame is returned.
    """
    per_song = pd.Series(song_played_counts(songs))
    counts_table = pd.DataFrame(per_song, columns=['song_played_times'])
    # Move the song ids out of the index into a regular ``song_id`` column.
    counts_table = counts_table.reset_index()
    counts_table = counts_table.rename(columns={'index': 'song_id'})
    return songs.merge(counts_table, how='left', on='song_id')
# add artist counts feature 'hot artist'
def add_artist_counts(songs):
    """Add ``artist_count``: the number of rows sharing each ``artist_name``.

    The former ``.agg({'artist_count': 'count'})`` call used dict-renaming on
    a single grouped column, which pandas >= 1.0 rejects with
    ``SpecificationError``. Group sizes are now computed with ``size()`` and
    named explicitly.

    Args:
        songs: DataFrame with an ``artist_name`` column.

    Returns:
        A new DataFrame: ``songs`` left-merged with the per-artist counts.
        Rows whose ``artist_name`` is missing get NaN, because groupby drops
        missing keys by default.
    """
    artist_totals = (
        songs.groupby('artist_name')
        .size()
        .reset_index(name='artist_count')
    )
    return songs.merge(artist_totals, on='artist_name', how='left')
# add composer played counts
## Count the number of times the composer has been listened
def add_composer_counts(songs):
    """Add ``composer_count``: the number of rows sharing each ``composer``.

    The former ``.agg({'composer_count': 'count'})`` call used dict-renaming
    on a single grouped column, which pandas >= 1.0 rejects with
    ``SpecificationError``. Group sizes are now computed with ``size()`` and
    named explicitly.

    Args:
        songs: DataFrame with a ``composer`` column.

    Returns:
        A new DataFrame: ``songs`` left-merged with the per-composer counts.
        Rows whose ``composer`` is missing get NaN, because groupby drops
        missing keys by default.
    """
    composer_totals = (
        songs.groupby('composer')
        .size()
        .reset_index(name='composer_count')
    )
    return songs.merge(composer_totals, on='composer', how='left')
#add lyricist played counts
# Count the number of times the lyricist has been listened
def add_lyricist_counts(songs):
    """Add ``lyricist_count``: the number of rows sharing each ``lyricist``.

    The former ``.agg({'lyricist_count': 'count'})`` call used dict-renaming
    on a single grouped column, which pandas >= 1.0 rejects with
    ``SpecificationError``. Group sizes are now computed with ``size()`` and
    named explicitly.

    Args:
        songs: DataFrame with a ``lyricist`` column.

    Returns:
        A new DataFrame: ``songs`` left-merged with the per-lyricist counts.
        Rows whose ``lyricist`` is missing get NaN, because groupby drops
        missing keys by default.
    """
    lyricist_totals = (
        songs.groupby('lyricist')
        .size()
        .reset_index(name='lyricist_count')
    )
    return songs.merge(lyricist_totals, on='lyricist', how='left')
# add genre type counts 'popular genres'
# Grouping must use the first_genre_type attribute
def add_genere_counts(songs):
    """Add ``first_genre_typecount``: rows sharing each ``first_genre_type``.

    The former ``.agg({'first_genre_typecount': 'count'})`` call used
    dict-renaming on a single grouped column, which pandas >= 1.0 rejects
    with ``SpecificationError``. Group sizes are now computed with ``size()``
    and named explicitly.

    Args:
        songs: DataFrame that already has a ``first_genre_type`` column.

    Returns:
        A new DataFrame: ``songs`` left-merged with the per-genre counts.
        Rows whose ``first_genre_type`` is missing get NaN, because groupby
        drops missing keys by default.
    """
    genre_totals = (
        songs.groupby('first_genre_type')
        .size()
        .reset_index(name='first_genre_typecount')
    )
    return songs.merge(genre_totals, on='first_genre_type', how='left')
# add feat feature
def add_feat_feature(songs):
    """Flag rows whose ``artist_name`` contains the substring ``'feat'``.

    The flag is computed from the ``artist_name`` column alone instead of a
    row-wise ``DataFrame.apply(axis=1)``. That avoids building a Series per
    row, and it also works on an empty frame, where the row-wise apply hands
    back a DataFrame that cannot be assigned to a single column.

    Args:
        songs: DataFrame with an ``artist_name`` column.

    Returns:
        ``songs`` itself, with an int64 ``featured_song`` column of 1/0.
        Values are converted with ``str`` before the test, so a missing name
        becomes ``'nan'`` and is flagged 0.
    """
    has_feat = ['feat' in str(name) for name in songs['artist_name']]
    songs['featured_song'] = np.array(has_feat, dtype='int64')
    return songs
# add if_artist_composer_lyricist_are_same feature:
def add_is_same_feature(songs):
    """Add equality flags between artist, composer and lyricist.

    ``same_c_l`` is 1 where ``composer == lyricist``. ``all_same`` is 1 where
    additionally ``artist_name == composer``. Values are compared pairwise
    with Python ``==``, so NaN never equals NaN and rows with missing credits
    get 0.

    The flags are built from the three columns directly instead of a row-wise
    ``DataFrame.apply(axis=1)``. That avoids building a Series per row, and
    it also works on an empty frame, where the row-wise apply hands back a
    DataFrame that cannot be assigned to a single column.

    Args:
        songs: DataFrame with ``artist_name``, ``composer`` and ``lyricist``
            columns.

    Returns:
        ``songs`` itself, with int64 ``same_c_l`` and ``all_same`` columns.
    """
    composer_is_lyricist = []
    everyone_matches = []
    credits = zip(songs['artist_name'], songs['composer'], songs['lyricist'])
    for artist, composer, lyricist in credits:
        same_writer = bool(composer == lyricist)
        composer_is_lyricist.append(same_writer)
        everyone_matches.append(bool(artist == composer) and same_writer)
    songs['same_c_l'] = np.array(composer_is_lyricist, dtype='int64')
    songs['all_same'] = np.array(everyone_matches, dtype='int64')
    return songs
#deal with missing values
def songs_language_to_str(language):
    """Convert a numeric language code to its integer string form.

    Args:
        language: Raw value from the ``language`` column.

    Returns:
        ``str(int(language))`` for a finite number other than -1 (for example
        ``3.0 -> '3'``). ``np.nan`` for -1, NaN, infinities, booleans and any
        non-numeric value.
    """
    numeric_types = (int, float, np.integer, np.floating)
    # isinstance (not an exact type match) lets NumPy floats and integer
    # codes through; the guard also keeps np.isfinite away from strings and
    # None, which it cannot handle.
    if isinstance(language, bool) or not isinstance(language, numeric_types):
        return np.nan
    if language == -1 or not np.isfinite(language):
        return np.nan
    return str(int(language))
def missing_value(songs):
    """Fill gaps in ``song_length`` and derive the ``song_language`` column.

    Missing ``song_length`` entries are replaced by the column mean.
    ``song_language`` is produced by applying ``songs_language_to_str`` to
    the ``language`` column.

    Args:
        songs: DataFrame with ``song_length`` and ``language`` columns.

    Returns:
        ``songs`` itself, modified in place.
    """
    mean_length = songs['song_length'].mean()
    # Assign the filled column back explicitly: an in-place fillna on the
    # ``songs.song_length`` accessor does not update the frame when pandas
    # Copy-on-Write is active.
    songs['song_length'] = songs['song_length'].fillna(mean_length)
    songs['song_language'] = songs['language'].apply(songs_language_to_str)
    return songs
#delete unuseful features
def del_unuseful_feature(songs):
    """Return ``songs`` without the raw columns that are no longer needed.

    Removes ``isrc``, ``name``, ``genre_ids``, ``artist_name``, ``composer``,
    ``lyricist`` and ``language``. The input frame is left untouched, and a
    ``KeyError`` is raised if any of these columns is absent.
    """
    discarded = [
        'isrc',
        'name',
        'genre_ids',
        'artist_name',
        'composer',
        'lyricist',
        'language',
    ]
    return songs.drop(columns=discarded)
# finish song features extraction
\ No newline at end of file
import pickle
def save(obj, filename):
    """Pickle ``obj`` into ``filename`` and return 1.

    The file is opened in binary write mode, so existing content is replaced.
    The highest pickle protocol available is used.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
    return 1
##-- Load obj from file
def load(filename):
    """Read and return the object pickled in ``filename``."""
    # NOTE(review): unpickling can execute arbitrary code; only use this on
    # files from a trusted source.
    with open(filename, mode='rb') as source:
        return pickle.load(source)
\ No newline at end of file
from tfn import TRAIN_FILE, TEST_FILE, MEMBERS_FILE, SONGS_FILE, SONGS_EXTRA_FILE
import numpy as np
import pandas as pd
import datetime
import time
import re
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
def _get_training_data_from_csv():
    """Read the five raw CSV tables into DataFrames.

    Returns:
        Tuple ``(train, test, members, songs, songs_extra)`` read from
        ``TRAIN_FILE``, ``TEST_FILE``, ``MEMBERS_FILE``, ``SONGS_FILE`` and
        ``SONGS_EXTRA_FILE`` respectively, each with the first row as header.
    """
    # Kept under its own name: previously the TRAIN_FILE frame was bound to
    # ``songs`` and immediately overwritten by the SONGS_FILE read, so it was
    # never returned.
    train = pd.read_csv(TRAIN_FILE, header=0)
    test = pd.read_csv(TEST_FILE, header=0)
    members = pd.read_csv(MEMBERS_FILE, header=0)
    # The stray second comma after SONGS_FILE (a syntax error) is removed.
    songs = pd.read_csv(SONGS_FILE, header=0)
    songs_extra = pd.read_csv(SONGS_EXTRA_FILE, header=0)
    return train, test, members, songs, songs_extra
if __name__ == '__main__':
    # NOTE(review): ``songs`` appears twice in the unpacking target, so the
    # first returned frame is discarded and ``songs`` ends up bound to the
    # fourth one.
    songs, test, members, songs, songs_extra = _get_training_data_from_csv()

    # Keep at most the first 1048575 rows of each song table.
    songs = songs.head(1048575)
    songs_extra = songs_extra.head(1048575)

    # Attach the extra song info to the song table, matching on song_id.
    songs = songs.merge(right=songs_extra, how='left', on='song_id')
    del songs_extra
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment