import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from scipy import stats
from scipy import stats, special
from sklearn import model_selection, metrics, linear_model, datasets, feature_selection
from sklearn import neighbors
from sklearn.preprocessing import StandardScaler
import time
from scipy import io
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import json
import seaborn as sns
import ast
# Scatter plot of revenue against budget on linear axes.
plt.figure(figsize=(8, 6))
plt.scatter(x=train['budget'], y=train['revenue'])
plt.gca().set(title='Revenue vs Budget', xlabel='Budget', ylabel='Revenue')
plt.show()
# Same scatter plot with both variables passed through a base-10 logarithm.
plt.figure(figsize=(8, 6))
plt.scatter(x=np.log10(train['budget']), y=np.log10(train['revenue']))
plt.gca().set(
    title='Revenue vs Budget',
    xlabel='Budget [log10]',
    ylabel='Revenue [log10]',
)
plt.show()
# Scatter plot of revenue against popularity, both on a base-10 log scale.
plt.figure(figsize=(8, 6))
plt.scatter(np.log10(train['popularity']), np.log10(train['revenue']))
plt.title('Revenue vs popularity')
# The values are transformed with np.log10, so the axis labels name the base
# explicitly (matching the budget plot) instead of the ambiguous "[log]".
plt.xlabel('Popularity [log10]')
plt.ylabel('Revenue [log10]')
plt.show()
# Horizontal bar chart: the ten rows with the largest budget.
(
    train
    .sort_values(by='budget', ascending=False)
    .head(10)
    .plot.barh(x='original_title', y='budget')
)
plt.xlabel('Budget [USD]');

# Horizontal bar chart: the ten rows with the largest revenue.
(
    train
    .sort_values(by='revenue', ascending=False)
    .head(10)
    .plot.barh(x='original_title', y='revenue')
)
plt.xlabel('Revenue [USD]');

# Horizontal bar chart: the ten rows with the largest profit, where profit is
# revenue minus budget, computed on a temporary copy (train itself is unchanged).
(
    train
    .assign(profit=train['revenue'] - train['budget'])
    .sort_values(by='profit', ascending=False)
    .head(10)
    .plot.barh(x='original_title', y='profit')
)
plt.xlabel('Profit [USD]');
# Mean revenue per value of the 'genres' column, sorted ascending, as horizontal bars.
(
    train
    .groupby('genres')['revenue']
    .mean()
    .sort_values()
    .plot.barh()
)
plt.xlabel('Revenue [USD]');
# Box plot of revenue for each value of 'collection', drawn on a symlog y-axis.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set(yscale='symlog')
sns.boxplot(data=train, x='collection', y='revenue', ax=ax);
def parse_json(x):
    """Return the 'name' of the first record in a stringified list of dicts.

    Single quotes in ``x`` are replaced by double quotes so the text can be
    decoded with ``json.loads``; the ``'name'`` value of element 0 is returned.

    Parameters
    ----------
    x : str
        Text such as ``"[{'id': 18, 'name': 'Drama'}]"``.

    Returns
    -------
    The first record's ``'name'`` value, or ``''`` when ``x`` is not a string,
    is not decodable, is empty, or does not have the expected shape.

    Notes
    -----
    Because every apostrophe is turned into a double quote, text whose values
    themselves contain an apostrophe does not decode and yields ``''``.
    """
    try:
        return json.loads(x.replace("'", '"'))[0]['name']
    except (AttributeError, ValueError, LookupError, TypeError):
        # AttributeError: x has no .replace (e.g. NaN or None).
        # ValueError:     json.JSONDecodeError for text that is not valid JSON.
        # LookupError:    empty list (IndexError) or missing key (KeyError).
        # TypeError:      decoded value is not subscriptable as list-of-dicts.
        # Anything else (KeyboardInterrupt, SystemExit, ...) propagates.
        return ''
# Mean revenue per value of 'production_companies'; the 20 largest means as horizontal bars.
(
    train
    .groupby('production_companies')['revenue']
    .mean()
    .sort_values(ascending=False)
    .head(20)
    .plot.barh()
)
plt.xlabel('Revenue [USD]');
# Sentiment analysis of each film's 'overview' and 'tagline' text, using the
# SentimentIntensityAnalyzer class from the vaderSentiment package.
analyser = SentimentIntensityAnalyzer()
# Replace missing (NaN) 'overview' and 'tagline' values with an empty string ('')
# before the analyser scores are computed.
train[['overview', 'tagline']] = train[['overview', 'tagline']].fillna('')
# Correlation of the two sentiment columns with revenue. The original author
# notes that there is (almost) no correlation between revenue and the
# 'compound' value (vaderSentiment's composite sentiment score) of the
# 'overview' and 'tagline' texts.
# NOTE(review): 'tag_sentiment' and 'sentiment' are not created in the code
# visible here — presumably they hold those compound scores; confirm.
train.loc[:, ['tag_sentiment', 'sentiment']].corrwith(train['revenue'])
def text_to_list(x):
    """Turn the text of a Python literal into the object it describes.

    A value that ``pd.isna`` reports as missing becomes ``''``; any other
    value is handed to ``ast.literal_eval`` and the evaluated object is
    returned.
    """
    return '' if pd.isna(x) else ast.literal_eval(x)
# Stack train and test into one frame so the feature steps run on both.
combined = pd.concat((train, test), sort=False)
combined.drop(columns=['id', 'imdb_id', 'poster_path', 'title', 'original_title'], inplace=True)
# Parse the stringified list columns into Python objects ('' where missing).
for col in ['genres', 'production_companies', 'production_countries',
            'spoken_languages', 'Keywords', 'cast', 'crew']:
    combined[col] = combined[col].apply(text_to_list)
# Binary flag: 1 when the film has a tagline, 0 otherwise.
# train['tagline'] already had its NaNs replaced by '' further up in this
# file, so an isna() test alone would flag every training row as having a
# tagline while test rows were still judged on NaN. An empty string is
# therefore treated as missing too.
combined['tagline'] = 1 * combined['tagline'].fillna('').ne('')
# Binary flag: 1 when the film has a homepage, 0 otherwise.
combined['homepage'] = 1 * combined['homepage'].notna()
# New feature: the number of characters in each movie's overview
combined['overview'] = combined['overview'].str.len()
# Any movie without an overview (NaN) is set to zero. Assign the result back
# rather than calling fillna(inplace=True) on the column selection, which is
# deprecated chained assignment and has no effect under pandas Copy-on-Write.
combined['overview'] = combined['overview'].fillna(0)
def parse_genre(x):
    """Return the first three genre names of one row as a labelled Series.

    Parameters
    ----------
    x : str or sequence of dict
        A string (the placeholder used for a missing value) or a sequence of
        records that each carry a ``'name'`` key.

    Returns
    -------
    pandas.Series
        Indexed ``genres1``, ``genres2``, ``genres3``. Slots with no
        corresponding record hold ``''``. An empty sequence yields three
        empty strings (previously it fell through every branch and
        returned ``None``).
    """
    labels = ['genres1', 'genres2', 'genres3']
    # A string means "no data"; otherwise keep at most one name per slot.
    names = [] if isinstance(x, str) else [item['name'] for item in x[:len(labels)]]
    # Pad with '' so the Series always has exactly one value per label.
    names.extend([''] * (len(labels) - len(names)))
    return pd.Series(names, index=labels)
# Number of entries in each row's 'production_companies' value (0 for '').
combined['production_company_number'] = combined['production_companies'].apply(len)
def parse_production_companies(x):
    """Return the first three production-company names of one row.

    Parameters
    ----------
    x : str or sequence of dict
        A string (the placeholder used for a missing value) or a sequence of
        records that each carry a ``'name'`` key.

    Returns
    -------
    pandas.Series
        Indexed ``prod1``, ``prod2``, ``prod3``. Slots with no corresponding
        record hold ``''``. An empty sequence yields three empty strings
        (previously it fell through every branch and returned ``None``).
    """
    labels = ['prod1', 'prod2', 'prod3']
    # A string means "no data"; otherwise keep at most one name per slot.
    names = [] if isinstance(x, str) else [item['name'] for item in x[:len(labels)]]
    # Pad with '' so the Series always has exactly one value per label.
    names.extend([''] * (len(labels) - len(names)))
    return pd.Series(names, index=labels)
# Number of entries in each row's 'production_countries' value (0 for '').
combined['production_country_number'] = combined['production_countries'].apply(len)
def parse_production_countries(x):
    """Return the first three production-country names of one row.

    Parameters
    ----------
    x : str or sequence of dict
        A string (the placeholder used for a missing value) or a sequence of
        records that each carry a ``'name'`` key.

    Returns
    -------
    pandas.Series
        Indexed ``country1``, ``country2``, ``country3``. Slots with no
        corresponding record hold ``''``. An empty sequence yields three
        empty strings (previously it fell through every branch and
        returned ``None``).
    """
    labels = ['country1', 'country2', 'country3']
    # A string means "no data"; otherwise keep at most one name per slot.
    names = [] if isinstance(x, str) else [item['name'] for item in x[:len(labels)]]
    # Pad with '' so the Series always has exactly one value per label.
    names.extend([''] * (len(labels) - len(names)))
    return pd.Series(names, index=labels)
# Expand 'production_countries' into three name columns, then remove the source column.
combined[['country1', 'country2', 'country3']] = (
    combined['production_countries'].apply(parse_production_countries)
)
del combined['production_countries']
# Parse and break down the 'release_date' column (month/day/two-digit-year text).
# NOTE(review): with '%y', two-digit years 69-99 are read as 1969-1999 and
# 00-68 as 2000-2068, so any film released before 1969 would be dated a
# century late — confirm against the data.
combined['release_date'] = pd.to_datetime(combined['release_date'], format='%m/%d/%y')
# Each missing value below is filled by assigning the result back to the
# column. Calling fillna(inplace=True) on a column selection is deprecated
# chained assignment and has no effect under pandas Copy-on-Write.
# 'weekday' (Monday=0): missing values get 4, which the original comment
# gives as the most common weekday.
combined['weekday'] = combined['release_date'].dt.weekday
combined['weekday'] = combined['weekday'].fillna(4)
# 'month': missing values get 9, which the original comment gives as the
# most common month.
combined['month'] = combined['release_date'].dt.month
combined['month'] = combined['month'].fillna(9)
# 'year': missing values get the median of the 'year' column.
combined['year'] = combined['release_date'].dt.year
combined['year'] = combined['year'].fillna(combined['year'].median())
# 'day': missing values get 1, which the original comment gives as the most
# common day of the month.
combined['day'] = combined['release_date'].dt.day
combined['day'] = combined['day'].fillna(1)
# Drop the original 'release_date' column
combined.drop(columns=['release_date'], inplace=True)
# Fill missing runtimes with the column median. The result is assigned back:
# fillna(inplace=True) on a column selection is deprecated chained assignment
# and has no effect under pandas Copy-on-Write.
combined['runtime'] = combined['runtime'].fillna(combined['runtime'].median())
# Number of entries in each row's 'spoken_languages' value (0 for '').
combined['spoken_languages_number'] = combined['spoken_languages'].apply(len)
def parse_spoken_languages(x):
    """Return the first three spoken-language names of one row.

    Parameters
    ----------
    x : str or sequence of dict
        A string (the placeholder used for a missing value) or a sequence of
        records that each carry a ``'name'`` key.

    Returns
    -------
    pandas.Series
        Indexed ``lang1``, ``lang2``, ``lang3``. Slots with no corresponding
        record hold ``''``. An empty sequence yields three empty strings
        (previously it fell through every branch and returned ``None``).
    """
    labels = ['lang1', 'lang2', 'lang3']
    # A string means "no data"; otherwise keep at most one name per slot.
    names = [] if isinstance(x, str) else [item['name'] for item in x[:len(labels)]]
    # Pad with '' so the Series always has exactly one value per label.
    names.extend([''] * (len(labels) - len(names)))
    return pd.Series(names, index=labels)
# Expand 'spoken_languages' into three name columns, then remove the source column.
combined[['lang1', 'lang2', 'lang3']] = (
    combined['spoken_languages'].apply(parse_spoken_languages)
)
del combined['spoken_languages']
# Missing 'status' values are set to 'Released'. The result is assigned back:
# fillna(inplace=True) on a column selection is deprecated chained assignment
# and has no effect under pandas Copy-on-Write.
combined['status'] = combined['status'].fillna('Released')
# Number of entries in each row's 'Keywords' value (0 for '').
combined['keywords_number'] = combined['Keywords'].apply(len)
def parse_keywords(x):
    """Return the first three keyword names of one row as a labelled Series.

    Parameters
    ----------
    x : str or sequence of dict
        A string (the placeholder used for a missing value) or a sequence of
        records that each carry a ``'name'`` key.

    Returns
    -------
    pandas.Series
        Indexed ``key1``, ``key2``, ``key3``. Slots with no corresponding
        record hold ``''``. An empty sequence yields three empty strings
        (previously it fell through every branch and returned ``None``).
    """
    labels = ['key1', 'key2', 'key3']
    # A string means "no data"; otherwise keep at most one name per slot.
    names = [] if isinstance(x, str) else [item['name'] for item in x[:len(labels)]]
    # Pad with '' so the Series always has exactly one value per label.
    names.extend([''] * (len(labels) - len(names)))
    return pd.Series(names, index=labels)
# Expand 'Keywords' into three name columns, then remove the source column.
combined[['key1', 'key2', 'key3']] = (
    combined['Keywords'].apply(parse_keywords)
)
del combined['Keywords']
# For each row, count the cast entries whose 'gender' code equals 0, 1 and 2
# respectively. A row whose cast is '' has nothing to iterate and counts 0.
combined['gender_0_number'] = combined['cast'].apply(
    lambda members: sum(1 for person in members if person['gender'] == 0)
)
combined['gender_1_number'] = combined['cast'].apply(
    lambda members: sum(1 for person in members if person['gender'] == 1)
)
combined['gender_2_number'] = combined['cast'].apply(
    lambda members: sum(1 for person in members if person['gender'] == 2)
)
# Total number of cast entries per row (0 for '').
combined['cast_number'] = combined['cast'].apply(len)
def parse_cast(x):
    """Return the ids of the first five cast entries of one row.

    ``x`` is either a string (no cast data) or a sequence of records that
    each carry an ``'id'`` key. The result is a Series indexed ``cast1`` to
    ``cast5``; positions without a corresponding entry hold ``-1``.
    """
    labels = [f'cast{position}' for position in range(1, 6)]
    # A string means "no data"; otherwise take the leading ids in order.
    ids = [] if isinstance(x, str) else [member['id'] for member in x[:5]]
    # Fill the unused trailing slots with the -1 sentinel.
    ids += [-1] * (len(labels) - len(ids))
    return pd.Series(ids, index=labels)
# Expand 'cast' into five id columns, then remove the source column.
combined[['cast1', 'cast2', 'cast3', 'cast4', 'cast5']] = (
    combined['cast'].apply(parse_cast)
)
del combined['cast']
# Number of entries in each row's 'crew' value (0 for '').
combined['crew_number'] = combined['crew'].apply(len)
def parse_crew(x):
    """Return the director id and producer id found in one row's crew.

    ``x`` is either a string (no crew data) or an iterable of records with
    ``'job'`` and ``'id'`` keys. The result is a Series indexed ``Director``
    and ``Producer``; a role that never appears is ``-1``. When a role
    appears more than once, the id of its last occurrence is kept.
    """
    director_id = -1
    producer_id = -1
    if not isinstance(x, str):
        for member in x:
            job = member['job']
            if job == 'Director':
                director_id = member['id']
            elif job == 'Producer':
                producer_id = member['id']
    return pd.Series([director_id, producer_id], index=['Director', 'Producer'])
# Expand 'crew' into director/producer id columns, then remove the source column.
combined[['Director', 'Producer']] = (
    combined['crew'].apply(parse_crew)
)
del combined['crew']
# log(1 + x) versions of budget and popularity.
combined['budget_log'] = combined['budget'].pipe(np.log1p)
combined['pop_log'] = combined['popularity'].pipe(np.log1p)
# Label-encode each group of three sibling columns with one encoder per
# group, fitted on the union of the values found in all three columns, so a
# given value gets the same integer code whichever column it appears in.
# After the loop, `cols`, `allitems` and `labeler` hold the values for the
# last group, as they did when the groups were written out one by one.
for cols in (
    ['genres1', 'genres2', 'genres3'],
    ['prod1', 'prod2', 'prod3'],
    ['country1', 'country2', 'country3'],
    ['lang1', 'lang2', 'lang3'],
    ['key1', 'key2', 'key3'],
):
    allitems = list(set(combined[cols].values.ravel().tolist()))
    labeler = LabelEncoder().fit(allitems)
    combined[cols] = combined[cols].apply(labeler.transform)
# Work on a copy and label-encode every remaining object-dtype column,
# each with its own freshly fitted encoder.
combined_dummy = combined.copy()
cat_col = combined.select_dtypes(include='object').columns
combined_dummy[cat_col] = combined_dummy[cat_col].apply(
    lambda column: LabelEncoder().fit_transform(column)
)
# Split back by position: the first ntrain rows are the training set,
# the last ntest rows are the test set.
train_data = combined_dummy.iloc[:ntrain]
test_data = combined_dummy.iloc[-ntest:]
# Features: every column except 'revenue', which is the value to predict.
X_train = train_data.drop(columns=['revenue']).to_numpy()
# Target: log(1 + revenue). The original author reports that the log
# transformation of the revenue gives better results.
y_train = np.log1p(train_data['revenue'].to_numpy())
# Test features: 'revenue' is dropped here too; it is filled in at the end,
# once the model is ready.
X_test = test_data.drop(columns=['revenue']).to_numpy()
# Shuffled 5-fold splitter with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=123)
# Cross-validated predictions from a plain linear regression.
lr = LinearRegression()
y_pred = cross_val_predict(estimator=lr, X=X_train, y=y_train, cv=kf)
# Negative predictions are replaced by 0.
y_pred = np.where(y_pred < 0, 0, y_pred)
# Cross-validated predictions from a random forest (100 trees, depth <= 20, fixed seed).
rf = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=123)
y_pred = cross_val_predict(estimator=rf, X=X_train, y=y_train, cv=kf)
# Negative predictions are replaced by 0.
y_pred = np.where(y_pred < 0, 0, y_pred)
# Refit on the full training set and chart the feature importances,
# labelled with the feature column names (every column except 'revenue').
rf.fit(X_train, y_train)
imp = pd.Series(rf.feature_importances_, index=train_data.columns.drop('revenue'))
imp.sort_values(ascending=False).plot.barh(figsize=(8, 10))
# Cross-validated predictions from a LightGBM regressor.
lgb_model = lgb.LGBMRegressor(
    num_leaves=20,
    max_depth=-1,
    learning_rate=0.01,
    metrics='rmse',
    n_estimators=1500,
    feature_fraction=0.4,
)
y_pred = cross_val_predict(estimator=lgb_model, X=X_train, y=y_train, cv=kf)
# Refit on the full training set and chart the feature importances,
# labelled with the feature column names (every column except 'revenue').
lgb_model.fit(X_train, y_train)
imp = pd.Series(lgb_model.feature_importances_, index=train_data.columns.drop('revenue'))
imp.sort_values(ascending=False).plot.barh(figsize=(8, 10))