Kaggle TMDB Box Office Prediction
首先是常规的读数
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from wordcloud import WordCloud, STOPWORDS
from collections import OrderedDict
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Load the Kaggle TMDB train/test splits.
data = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
'''EDA'''
# Quick look at the data: first rows, then column dtypes / non-null counts.
data.head()
data.info()
变量处理
imdb_id,poster_path
扔掉一看就没用的IMDB ID和海报地址
# Identifier-like columns carry no predictive signal — drop them from both splits.
for frame in (data, test):
    frame.drop(['imdb_id', 'poster_path'], axis=1, inplace=True)
homepage,collections
68%的电影没有主页,将有无主页处理为二分类变量,画图比较一下,好像没啥太大区别,扔掉它(其实还是有一点的样子)
对是否系列的处理类似
# --- Binary indicator features from mostly-null text columns ---
# ~68% of films have no homepage; keep only a has/has-not flag.
# Idiom fix: use .notna() instead of `.isnull() == False` + loc assignment.
data['has_homepage'] = data['homepage'].notna().astype(int)
test['has_homepage'] = test['homepage'].notna().astype(int)
# Homepage v/s Revenue
sns.catplot(x='has_homepage', y='revenue', data=data);
plt.title('Revenue for film with and without homepage');
data = data.drop(['homepage'], axis=1)
test = test.drop(['homepage'], axis=1)

# Same treatment for franchise membership: flag, plot, drop the raw column.
data['collection'] = data['belongs_to_collection'].notna().astype(int)
test['collection'] = test['belongs_to_collection'].notna().astype(int)
# collections v/s Revenue
sns.catplot(x='collection', y='revenue', data=data);
plt.title('Revenue for film with and without collection');
# Belonging to a collection appears to increase revenue.
data = data.drop(['belongs_to_collection'], axis=1)
test = test.drop(['belongs_to_collection'], axis=1)
Genres
对电影类型的处理
1、统计下各种类型的频率,画个词云图,按频次排个序
# Exploring genres: frequency of each film's FIRST listed genre.
# The 'genres' column holds stringified lists of dicts; eval() parses them.
# NOTE(security): eval on untrusted input is unsafe in general — the data here
# is the fixed Kaggle CSV, but ast.literal_eval would be the safer choice.
genres = {}
for raw in data['genres']:
    if pd.isnull(raw):
        continue
    # Parse once instead of re-eval'ing the string for every dict access.
    first_genre = eval(raw)[0]['name']
    genres[first_genre] = genres.get(first_genre, 0) + 1
plt.figure(figsize=(12, 8))
wordcloud = WordCloud(background_color="white", width=1000, height=1000,
                      max_words=10, relative_scaling=0.5,
                      normalize_plurals=False).generate_from_frequencies(genres)
plt.imshow(wordcloud)
plt.title('Top genres')
plt.axis("off")
plt.show()
genres = OrderedDict(genres)
# Drama, Comedy and Thriller are popular genres.
# Sort counts ascending; the bare expression echoes the ranking in a notebook.
OrderedDict(sorted(genres.items(), key=lambda t: t[1]))
OrderedDict([('TV Movie', 1),
('Foreign', 2),
('Western', 13),
('History', 16),
('War', 20),
('Music', 20),
('Mystery', 33),
('Family', 36),
('Science Fiction', 41),
('Romance', 67),
('Fantasy', 68),
('Documentary', 71),
('Animation', 76),
('Thriller', 116),
('Crime', 147),
('Horror', 170),
('Adventure', 187),
('Action', 520),
('Comedy', 604),
('Drama', 785)])
统计每个样本包含几种类型的电影后生成新变量genres_count
# New feature: how many genres each movie is tagged with (0 when missing).
# Idiom fix: manual append loop replaced with a list comprehension.
data['num_genres'] = [
    0 if pd.isnull(raw) else len(eval(raw))
    for raw in data['genres']
]
画个图看看新变量跟票房的关系
# Genre count vs revenue: does tagging more genres correlate with box office?
sns.catplot(x='num_genres', y='revenue', data=data)
plt.title('Revenue for different number of genres in the film')
给测试数据也安排上
# Same genre-count feature for the test split.
# Idiom fix: manual append loop replaced with a list comprehension.
test['num_genres'] = [
    0 if pd.isnull(raw) else len(eval(raw))
    for raw in test['genres']
]
然后原始的类型列也扔掉,只保留计数特征
# The raw genres column has served its purpose; only the count feature remains.
for frame in (data, test):
    frame.drop(['genres'], axis=1, inplace=True)
Production companies
也记个数生成新变量扔掉
# Production companies: replace the raw list column with a simple count.
data['num_prod_companies'] = [
    0 if pd.isnull(raw) else len(eval(raw))
    for raw in data['production_companies']
]
# BUG FIX: the count was originally computed for the TRAIN split only, while
# the raw column was dropped from BOTH splits — leaving test with neither the
# raw column nor the num_prod_companies feature. Compute it for test as well,
# mirroring the num_genres handling.
test['num_prod_companies'] = [
    0 if pd.isnull(raw) else len(eval(raw))
    for raw in test['production_companies']
]
# Dropping production_companies
data.drop(['production_companies'], axis=1, inplace=True)
test.drop(['production_companies'], axis=1, inplace=True)
production_countries
国家也计数
production_countries
# Production countries: count per film for both splits, then drop the raw column.
# Idiom fix: manual append loops replaced with list comprehensions.
data['num_prod_countries'] = [
    0 if pd.isnull(raw) else len(eval(raw))
    for raw in data['production_countries']
]
# number of prod countries vs revenue
sns.catplot(x='num_prod_countries', y='revenue', data=data);
plt.title('Revenue for different number of production countries in the film');
# Same count feature for the test split.
test['num_prod_countries'] = [
    0 if pd.isnull(raw) else len(eval(raw))
    for raw in test['production_countries']
]
# The raw column is no longer needed once the count is extracted.
data.drop(['production_countries'], axis=1, inplace=True)
test.drop(['production_countries'], axis=1, inplace=True)
其他处理类似
训练模型
将预算折算成 1–100 的区间
# Min-max scale budget into [1, 100]. The range (m, n) is fit on TRAIN and the
# same transform is applied to test so the two splits stay comparable.
# BUG FIX: the original scaled only the train budgets, leaving the test budgets
# on the raw dollar scale — any model trained on scaled budgets would then see
# wildly out-of-range values at prediction time.
a, b = 1, 100
m, n = data.budget.min(), data.budget.max()
data['budget'] = (data.budget - m) / (n - m) * (b - a) + a
test['budget'] = (test.budget - m) / (n - m) * (b - a) + a
Traning the model
# Target vector and design matrix: every column except the label and the id.
y = data['revenue'].values
cols = [c for c in data.columns if c not in ('revenue', 'id')]
X = data[cols].values
```
## Model 1 - Linear Regression
```py
from sklearn.linear_model import LinearRegression

# Model 1: plain linear regression, scored by 10-fold cross-validated RMSE
# (sklearn returns negated MSE, hence the sign flip before the square root).
clf = LinearRegression()
scores = cross_val_score(clf, X, y, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
mean_rmse = rmse_scores.mean()
print(mean_rmse)
81680551.68067782
Model 2 - Random forest regression
from sklearn.ensemble import RandomForestRegressor

# Model 2: shallow random forest, evaluated with the same 10-fold CV RMSE
# protocol as the linear baseline for a fair comparison.
regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)
scores = cross_val_score(regr, X, y, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
mean_rmse = rmse_scores.mean()
print(mean_rmse)