kaggle TMDB Box Office Prediction

点这传送kaggle原作者
点这传送数据源&比赛

首先是常规的读取数据

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from wordcloud import WordCloud, STOPWORDS
from collections import OrderedDict

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Load the Kaggle training set and test set (train is read first, then test).
data, test = map(pd.read_csv, ("../input/train.csv", "../input/test.csv"))

# EDA: look at the first five rows, then at the per-column summary.
data.head()
data.info()

变量处理

imdb_id,poster_path

扔掉一看就没用的IMDB ID和海报地址

# Remove features that are irrelevant for the prediction: the IMDB identifier
# and the poster image path. Both frames are modified in place.
irrelevant_columns = ['imdb_id', 'poster_path']
for frame in (data, test):
    frame.drop(columns=irrelevant_columns, inplace=True)

homepage,collections

68%的电影没有主页,将有无主页处理为二分类变量,画图比较一下,好像没啥太大区别,扔掉它(其实还是有一点的样子)
对是否系列的处理类似

# The homepage column has a lot of null values, so it is reduced to a binary
# indicator: 1 when a homepage value is present, 0 when it is missing.
data['has_homepage'] = data['homepage'].notnull().astype('int64')
test['has_homepage'] = test['homepage'].notnull().astype('int64')

# Homepage vs. revenue (training data only, since only it has revenue here).
sns.catplot(data=data, x='has_homepage', y='revenue')
plt.title('Revenue for film with and without homepage');

# The raw column is not needed once the indicator exists.
data = data.drop(columns=['homepage'])
test = test.drop(columns=['homepage'])

kaggle TMDB Box Office Prediction

# Convert belongs_to_collection into a binary indicator:
# 1 when the cell holds a value, 0 when it is null.
data['collection'] = data['belongs_to_collection'].notnull().astype('int64')
test['collection'] = test['belongs_to_collection'].notnull().astype('int64')

# Collection vs. revenue
sns.catplot(data=data, x='collection', y='revenue')
plt.title('Revenue for film with and without collection');

# Belonging to a collection also increases the revenue; with the indicator
# in place, the raw column is removed from both frames.
data = data.drop(columns=['belongs_to_collection'])
test = test.drop(columns=['belongs_to_collection'])

kaggle TMDB Box Office Prediction

Genres

对电影类型的处理
1、统计下各种类型的频率,画个词云图,按频次排个序

# Exploring genres.
# Each non-null cell of data['genres'] is a string that eval() turns into a
# list whose entries are indexed by 'name'. Only the FIRST entry of each film
# is counted here.
# NOTE(review): eval() executes whatever text the CSV contains, so it is only
# acceptable for trusted data. If the cells are plain Python literals,
# ast.literal_eval would be the safe replacement -- confirm before switching.
genres = {}
for raw_cell in data['genres']:
    if pd.isnull(raw_cell):
        continue
    parsed = eval(raw_cell)  # parse once per row instead of up to three times
    if not parsed:
        # An empty list has no first genre; skip it instead of raising IndexError.
        continue
    first_genre = parsed[0]['name']
    genres[first_genre] = genres.get(first_genre, 0) + 1

# Word cloud sized by how often each genre appears as a film's first genre.
plt.figure(figsize=(12, 8))
wordcloud = WordCloud(background_color="white", width=1000, height=1000, max_words=10, relative_scaling=0.5,
                      normalize_plurals=False).generate_from_frequencies(genres)

plt.imshow(wordcloud)
plt.title('Top genres')
plt.axis("off")
plt.show()
genres = OrderedDict(genres)
# In the recorded output, Drama, Comedy and Action have the highest counts.
# sorted() orders the (genre, count) pairs by count, ascending; passing them to
# OrderedDict keeps that order. The result is not stored -- it is the cell's
# final expression, left for display.
OrderedDict(sorted(genres.items(), key=lambda t: t[1]))

kaggle TMDB Box Office Prediction

OrderedDict([('TV Movie', 1),
             ('Foreign', 2),
             ('Western', 13),
             ('History', 16),
             ('War', 20),
             ('Music', 20),
             ('Mystery', 33),
             ('Family', 36),
             ('Science Fiction', 41),
             ('Romance', 67),
             ('Fantasy', 68),
             ('Documentary', 71),
             ('Animation', 76),
             ('Thriller', 116),
             ('Crime', 147),
             ('Horror', 170),
             ('Adventure', 187),
             ('Action', 520),
             ('Comedy', 604),
             ('Drama', 785)])

统计每个样本包含几种类型的电影后生成新变量genres_count

# Number of genres listed for each film in the training data; a missing
# genres cell counts as 0.
# NOTE(review): eval() runs the cell text as Python -- trusted data only.
genres_count = [
    0 if pd.isnull(cell) else len(eval(cell))
    for cell in data['genres']
]
data['num_genres'] = genres_count

画个图看看新变量跟票房的关系

# Number of genres vs. revenue
sns.catplot(data=data, x='num_genres', y='revenue')
plt.title('Revenue for different number of genres in the film');

kaggle TMDB Box Office Prediction
给测试数据也安排上

# Same genre count for the test data; a missing genres cell counts as 0.
# NOTE(review): eval() runs the cell text as Python -- trusted data only.
genres_count_test = [
    0 if pd.isnull(cell) else len(eval(cell))
    for cell in test['genres']
]
test['num_genres'] = genres_count_test

然后类型也给扔了,想不通

# Drop the raw genres column from both frames (in place).
for frame in (data, test):
    frame.drop(columns=['genres'], inplace=True)

Production companies

也计个数生成新变量,然后扔掉原变量

# Production companies: replace the raw column by a per-film count
# (0 when the cell is missing).
# NOTE(review): eval() runs the cell text as Python -- trusted data only.
prod_comp_count = [
    0 if pd.isnull(cell) else len(eval(cell))
    for cell in data['production_companies']
]
data['num_prod_companies'] = prod_comp_count

# The raw column is dropped from the test frame below as well, so the test
# frame needs the same derived feature; otherwise train and test end up with
# different feature columns.
prod_comp_count_test = [
    0 if pd.isnull(cell) else len(eval(cell))
    for cell in test['production_companies']
]
test['num_prod_companies'] = prod_comp_count_test

# Dropping production_companies
data.drop(['production_companies'], axis=1, inplace=True)
test.drop(['production_companies'], axis=1, inplace=True)

production_countries

国家也计数

# production_countries
# (This heading was a bare expression without '#', which raises NameError
# when the cell is executed.)
# Adding the production_countries count for the training data; a missing cell
# counts as 0.
# NOTE(review): eval() runs the cell text as Python -- trusted data only.
prod_coun_count = [
    0 if pd.isnull(cell) else len(eval(cell))
    for cell in data['production_countries']
]
data['num_prod_countries'] = prod_coun_count

# Number of production countries vs. revenue
sns.catplot(x='num_prod_countries', y='revenue', data=data)
plt.title('Revenue for different number of production countries in the film');

kaggle TMDB Box Office Prediction

# Production-country count for the test data; a missing cell counts as 0.
# NOTE(review): eval() runs the cell text as Python -- trusted data only.
prod_coun_count_test = [
    0 if pd.isnull(cell) else len(eval(cell))
    for cell in test['production_countries']
]
test['num_prod_countries'] = prod_coun_count_test

# The raw column is removed from both frames (in place).
for frame in (data, test):
    frame.drop(columns=['production_countries'], inplace=True)

其他处理类似

训练模型

将预算换算到1~100的范围

# Normalizing budget: min-max rescale into the range [a, b] = [1, 100],
# using the minimum (m) and maximum (n) of the training budgets.
a, b = 1, 100
m, n = data.budget.min(), data.budget.max()
data['budget'] = (data.budget - m) / (n - m) * (b - a) + a
# Apply the identical transform, with the TRAINING min/max, to the test frame
# so that budget is on the same scale in both frames. Test budgets outside the
# training range map outside [a, b].
test['budget'] = (test.budget - m) / (n - m) * (b - a) + a

Training the model

# Target: the revenue column as an array.
y = data['revenue'].values
# Features: every remaining column except the target and the row id,
# kept in the frame's column order.
excluded = ('revenue', 'id')
cols = [name for name in data.columns if name not in excluded]
X = data.loc[:, cols].values
```
## Model 1 - Linear regression
```py
from sklearn.linear_model import LinearRegression

# 10-fold cross-validation scored with negated mean squared error.
clf = LinearRegression()
scores = cross_val_score(estimator=clf, X=X, y=y, cv=10, scoring="neg_mean_squared_error")
# Flip the sign back, take the square root per fold, and report the mean RMSE.
rmse_scores = np.sqrt(np.negative(scores))
print(np.mean(rmse_scores))

81680551.68067782

Model 2 - Random forest regression

from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees limited to depth 2, with a fixed random_state.
regr = RandomForestRegressor(n_estimators=100, max_depth=2, random_state=0)
# 10-fold cross-validation scored with negated mean squared error.
scores = cross_val_score(estimator=regr, X=X, y=y, cv=10, scoring="neg_mean_squared_error")
# Flip the sign back, take the square root per fold, and report the mean RMSE.
rmse_scores = np.sqrt(np.negative(scores))
print(np.mean(rmse_scores))