Prediction of Winning / Elimination Week¶

Import Dependencies

# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import statsmodels.api as sm

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

# special matplotlib argument for improved plots
from matplotlib import rcParams


import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import requests
import json
import math

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import *

from sklearn.metrics import roc_curve, auc

# import helper commands
from helpers import *

Import Data¶

# Load every raw data source used by the analysis.


def _read_json(path):
    """Parse the JSON document stored at *path* and return the result."""
    with open(path) as handle:
        return json.load(handle)


# Data for each contestant, broken down by season
seasonsDict = _read_json('tempdata/seasonsDict.json')

# Twitter
twitter = _read_json("twitter_sentiment.json")

# Distances
distances = _read_json("distances.json")

# Geo-cluster
geo_cluster = _read_json("geo_cluster.json")

# Profession
professions = _read_json("profession_dict.json")

# Wiki info
wiki = pd.read_csv("contestantDF.csv")

# Date guide
date_guide = pd.read_csv("date_guide.csv")

# PCA data, as stored on disk
raw_pca_data = _read_json('pca_dict.json')

# Re-key the PCA entries by the contestant keys used in `distances`: a raw
# PCA key is attached to a contestant when it is contained in the contestant
# key. If several raw keys match one contestant, the last one iterated wins.
# Contestants without any match get no entry.
pca_data = {
    season: {
        full_name: raw_pca_data[season][short_name]
        for full_name in distances[season]
        for short_name in raw_pca_data[season]
        if short_name in full_name
    }
    for season in distances
}

# Competition data
comp = _read_json("competition_data.json")

Aggregate Data to be Used in DF¶

# Organize all the data into a single list of contestant dicts.
# Each dict is taken from seasonsDict and extended in place with the twitter,
# distance, profession, geo-cluster and PCA fields added below.
contestants = []
for sn in seasonsDict:
    if int(sn) <= 12:  # only add data from seasons 13 and on
        continue

    for contestant in seasonsDict[sn]:
        cname = contestant['name']

        # Add twitter data; a twitter key matches when it is contained in the
        # contestant's name.
        # NOTE(review): when no key matches, none of the four tweet fields is
        # set for this contestant -- confirm that this is intended.
        for tname in twitter[sn]:
            if tname not in cname:
                continue

            per_date = twitter[sn][tname]
            if len(per_date) > 0:
                ntweets = [per_date[date]['ntweet'] for date in per_date]
                tweetSentiment = [per_date[date]['sentiment'] for date in per_date]
                contestant['avg_num_tweets'] = np.mean(ntweets)
                contestant['total_tweets'] = np.sum(ntweets)
                contestant['avg_tweet_sentiment'] = np.mean(tweetSentiment)
                contestant['total_tweet_sentiment'] = np.sum(tweetSentiment)
            else:
                # A matching key with no per-date entries counts as zero activity.
                contestant['avg_num_tweets'] = 0
                contestant['total_tweets'] = 0
                contestant['avg_tweet_sentiment'] = 0
                contestant['total_tweet_sentiment'] = 0

        # Contestants without a distance entry are left out of the list.
        if cname not in distances[sn]:
            continue

        # add distance data
        contestant['distance'] = distances[sn][cname]

        # add profession data
        contestant['profession'] = professions[sn][cname]

        # add geo cluster data
        contestant['geo_cluster'] = geo_cluster[sn][cname]

        # add the two principal components when they are available
        if cname in pca_data[sn]:
            contestant['pc1'], contestant['pc2'] = pca_data[sn][cname]

        contestants.append(contestant)

Generate DF for All Data¶

# Create a data frame from the list of contestant dictionaries
df = pd.DataFrame(contestants)

# get rid of any samples that contain NaN (currently disabled)
#df = df.dropna()
# just in case, we drop any duplicate samples
df = df.drop_duplicates()
# get rid of the samples for bachelors; take an explicit copy so the column
# assignments below write to an independent frame rather than a slice of
# the previous one
df = df[df.elimination != 'unknown'].copy()
# create 0/1 indicator columns for winning and for finishing as runner-up
df['won'] = (df.elimination == 'Winner')*1
df['runner_up'] = (df.elimination == 'Runner-up')*1
# winners and runners up are currently strings. We need to turn them into integers
df['elimination'] = df['elimination'].apply(change_winner_runnerup)
df['elimination'] = df['elimination'].astype('int')
# create a dataframe where the unnecessary columns are dropped
# (axis is passed by keyword; a positional axis is rejected by recent pandas)
df_temp = df.drop(['hometown','name','occupation','season','pc1','pc2'], axis=1)
df_temp.head(10)

Get the shape of data¶

# Dimensions of the modelling frame as (rows, columns)
df_temp.shape

(188, 15)

Get the summary statistics for each of the columns¶

# Summary statistics for the columns of df_temp
df_temp.describe()

Look at any correlations between variables.¶

We particularly look at the correlation between the independent variables and each contestant's elimination week, as well as whether that contestant won the entire competition.

# Pairwise correlation matrix of df, showing only the `won` column
df.corr().won

avg_num_tweets                 0.220814
avg_tweet_sentiment            0.007494
distance                      -0.043893
elimination                    0.440292
geo_cluster                    0.009272
group_dates                    0.181611
individual_dates               0.267352
pc1                           -0.200772
pc2                           -0.103522
profession                    -0.014954
roses_from_group_dates         0.170485
roses_from_individual_dates    0.276920
season                        -0.007815
total_tweet_sentiment         -0.014137
total_tweets                   0.410360
won                            1.000000
runner_up                     -0.038674
Name: won, dtype: float64

We look primarily at those variables that are correlated with winning the entire competition and those correlated with a higher elimination week.

The variables with a higher positive correlation with winning the entire competition were the average number of tweets, the total number of individual and group dates, and the number of roses acquired from these dates.

# Pairwise correlation matrix of df, showing only the `elimination` column
df.corr().elimination

avg_num_tweets                 0.611631
avg_tweet_sentiment           -0.020639
distance                      -0.084877
elimination                    1.000000
geo_cluster                    0.006188
group_dates                    0.742743
individual_dates               0.673995
pc1                           -0.053224
pc2                           -0.195776
profession                     0.031437
roses_from_group_dates         0.525342
roses_from_individual_dates    0.657872
season                         0.113756
total_tweet_sentiment         -0.023664
total_tweets                   0.803922
won                            0.440292
runner_up                      0.369544
Name: elimination, dtype: float64

The variables with a higher positive correlation with later elimination weeks were the same as those that correlated with winning the entire competition. The correlations between these independent variables and the elimination week, however, were almost three times greater than the correlations between the same variables and winning the competition.

Correlation between average number of tweets and elimination week¶

# Scatter plot of elimination against average number of tweets, with a fitted regression line
sns.regplot(y="elimination", x="avg_num_tweets", data=df, fit_reg = True)

<matplotlib.axes._subplots.AxesSubplot at 0x10a3050d0>

/Users/hopemason/anaconda/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

# Scatter plot of elimination against roses from individual dates, with a fitted regression line
sns.regplot(y="elimination", x="roses_from_individual_dates", data=df, fit_reg = True)

<matplotlib.axes._subplots.AxesSubplot at 0x10b289290>

Looking at the Correlation between Dependent Variables¶

We attempt to visualize the correlation between the independent variables that are positively correlated with the dependent variables.

# Plot one histogram of avg_num_tweets per distinct value of individual_dates
# NOTE(review): newer seaborn releases name the `size` keyword `height` -- confirm against the installed version
g = sns.FacetGrid(col="individual_dates", data=df, size=8)
g.map(plt.hist, "avg_num_tweets")

<seaborn.axisgrid.FacetGrid at 0x10b5f3710>

The distribution of average tweet counts is shifted higher for contestants who received individual dates.

# Plot one histogram of avg_num_tweets per distinct value of roses_from_individual_dates
g = sns.FacetGrid(col="roses_from_individual_dates", data=df, size=8)
g.map(plt.hist, "avg_num_tweets")

<seaborn.axisgrid.FacetGrid at 0x10b6376d0>

Likewise, the distribution of average tweet counts is shifted higher for contestants who received a rose from individual dates.

# Plot one histogram of avg_tweet_sentiment per distinct value of roses_from_individual_dates
g = sns.FacetGrid(col="roses_from_individual_dates", data=df, size=8)
g.map(plt.hist, "avg_tweet_sentiment")

<seaborn.axisgrid.FacetGrid at 0x10d1ce150>

We also check whether contestants who received roses from individual dates tended to have a higher average Twitter sentiment. There is a relatively higher share of contestants with higher Twitter sentiment among those who received roses from individual dates.

Run Logistic Regression on Data¶

We want to run a logistic regression to determine the probability of a contestant winning the competition.

# fit logistic regression model
import statsmodels.api as sm
from statsmodels.formula.api import logit, glm, ols

# Model the 0/1 `won` indicator from average tweet count and roses received
# on individual dates, using the formula interface on the full frame `df`
logit_model = logit('won ~ avg_num_tweets + roses_from_individual_dates',df).fit()
# Display the fitted model's summary tables
logit_model.summary()

Optimization terminated successfully.
         Current function value: 0.123483
         Iterations 9

The only independent variables that were statistically significant were the average number of tweets (only at the 10% level, p = 0.090) and the number of roses a contestant received from individual dates (p = 0.036). Consequently, our pseudo R-squared is very low (0.2268) because of the small number of independent variables used in the regression.

Run a Linear Regression on Data¶

We want to run a linear regression to estimate a contestant's elimination week.

# Ordinary least squares fit of `elimination` on tweet volume, roses from
# individual and group dates, and the second principal component (pc2)
model = ols('elimination ~ avg_num_tweets + roses_from_individual_dates + roses_from_group_dates + pc2 ',df).fit()
# Display the fitted model's summary tables
model.summary()

The linear regression provides a better model for estimating the episode in which a contestant will be eliminated. In addition to the average number of tweets and roses from individual dates, roses from group dates were also statistically significant, and principal component 2 was marginally significant (p = 0.061).

# Bar chart of the fitted coefficients of `model`, one bar per model term.
fitted_params = model.params
features = list(fitted_params.keys())
coefs = list(fitted_params.values)

# Bars are centred on consecutive integer positions and labelled with the
# term names.
bar_positions = range(len(features))
plt.bar(bar_positions, coefs, align="center", width=.5)
plt.xticks(bar_positions, features, rotation=45)

plt.ylabel("Regression Coefficients")
plt.title("Regression Results")
plt.show()

Breakdown Data by Season¶

We want to break down the data by season to account for the differences in the number of tweets from each season, since the average number of tweets about all contestants should increase from year to year due to Twitter's broader adoption.

# One sub-frame of df_temp per season, for seasons 13 through 19.
# The row mask is built from `df`, which still carries the `season` column.
seasons_dfs = {
    season: df_temp[df['season'] == season]
    for season in range(13, 20)
}

# First rows of the season-14 sub-frame
seasons_dfs[14].head()

# Correlation of every numeric column with `won`, season 14 only
seasons_dfs[14].corr().won

avg_num_tweets                 0.454344
avg_tweet_sentiment           -0.453450
distance                      -0.024731
elimination                    0.554021
geo_cluster                   -0.082479
group_dates                    0.374192
individual_dates               0.302849
profession                    -0.223972
roses_from_group_dates        -0.060193
roses_from_individual_dates    0.302849
total_tweet_sentiment         -0.849981
total_tweets                   0.628029
won                            1.000000
runner_up                     -0.041667
Name: won, dtype: float64

It is interesting to note that average and total tweet sentiment have high negative correlations with winning the competition (-0.453450 and -0.849981, respectively), while the average and total number of tweets maintain a high positive correlation with winning the competition (0.454344 and 0.628029, respectively).

# Correlation of every numeric column with `elimination`, season 14 only
seasons_dfs[14].corr().elimination

avg_num_tweets                 0.726037
avg_tweet_sentiment           -0.376984
distance                       0.402683
elimination                    1.000000
geo_cluster                   -0.170559
group_dates                    0.661066
individual_dates               0.501505
profession                    -0.176804
roses_from_group_dates         0.276867
roses_from_individual_dates    0.501505
total_tweet_sentiment         -0.583122
total_tweets                   0.881673
won                            0.554021
runner_up                      0.473494
Name: elimination, dtype: float64

The same phenomena are true when looking at these independent variables and a contestant's elimination week.

# Season 14 only: elimination against average tweet sentiment, with a fitted regression line
sns.regplot(y="elimination", x="avg_tweet_sentiment", data=seasons_dfs[14], fit_reg = True)

<matplotlib.axes._subplots.AxesSubplot at 0x10ebe9850>

# Season 14 only: elimination against average number of tweets, with a fitted regression line
sns.regplot(y="elimination", x="avg_num_tweets", data=seasons_dfs[14], fit_reg = True)

<matplotlib.axes._subplots.AxesSubplot at 0x10ed60910>

	age	avg_num_tweets	avg_tweet_sentiment	distance	elimination	geo_cluster	group_dates	individual_dates	profession	roses_from_group_dates	roses_from_individual_dates	total_tweet_sentiment	total_tweets	won	runner_up
0	25	13.285714	0.179650	1025.064422	10	5	2	1	5	0	1	1.257547	93	1	0
1	24	10.285714	0.071815	226.889927	9	4	1	1	3	1	1	0.502707	72	0	1
2	29	5.333333	-0.073958	1879.977325	7	7	2	1	2	0	0	-0.443750	32	0	0
3	24	1.200000	0.005556	2026.201031	6	6	3	0	5	1	0	0.027778	6	0	0
4	34	4.000000	0.062500	538.679446	5	3	0	2	3	0	2	0.250000	16	0	0
5	27	0.666667	-0.166667	397.346118	4	2	2	0	2	0	0	-0.500000	2	0	0
6	25	2.666667	-0.066667	103.284845	4	1	2	0	3	0	0	-0.200000	8	0	0
7	29	2.000000	0.166667	627.494033	4	5	2	0	3	0	0	0.500000	6	0	0
8	29	0.000000	0.000000	309.413472	4	4	0	0	3	0	0	0.000000	0	0	0
9	25	0.500000	0.000000	439.699607	3	1	2	0	1	0	0	0.000000	1	0	0

	avg_num_tweets	avg_tweet_sentiment	distance	elimination	geo_cluster	group_dates	individual_dates	profession	roses_from_group_dates	roses_from_individual_dates	total_tweet_sentiment	total_tweets	won	runner_up
count	187.000000	187.000000	188.000000	188.000000	188.000000	188.000000	188.000000	188.000000	188.000000	188.000000	187.000000	187.000000	188.000000	188.000000
mean	69.345355	-0.004858	1167.517340	3.776596	3.909574	1.446809	0.292553	8.367021	0.175532	0.276596	-0.015337	322.203209	0.037234	0.037234
std	77.579937	0.056251	781.632641	2.787114	1.922931	1.531257	0.521770	72.654838	0.457892	0.515104	0.224103	460.825955	0.189840	0.189840
min	0.000000	-0.232394	86.282895	1.000000	1.000000	0.000000	0.000000	1.000000	0.000000	0.000000	-1.311356	0.000000	0.000000	0.000000
25%	0.000000	-0.019816	660.512782	1.000000	2.000000	0.000000	0.000000	2.000000	0.000000	0.000000	-0.068591	0.000000	0.000000	0.000000
50%	21.000000	0.000000	1054.494144	3.000000	4.000000	1.000000	0.000000	3.000000	0.000000	0.000000	0.000000	65.000000	0.000000	0.000000
75%	151.482143	0.005344	1535.580050	6.000000	6.000000	3.000000	1.000000	5.000000	0.000000	0.000000	0.024874	528.000000	0.000000	0.000000
max	200.000000	0.250000	5323.980053	10.000000	7.000000	5.000000	2.000000	999.000000	3.000000	2.000000	1.257547	1714.000000	1.000000	1.000000

Dep. Variable:	won	No. Observations:	187
Model:	Logit	Df Residuals:	184
Method:	MLE	Df Model:	2
Date:	Thu, 10 Dec 2015	Pseudo R-squ.:	0.2268
Time:	17:42:42	Log-Likelihood:	-23.091
converged:	True	LL-Null:	-29.864
		LLR p-value:	0.001145

	coef	std err	z	P>\|z\|	[95.0% Conf. Int.]
Intercept	-5.5503	1.229	-4.515	0.000	-7.960 -3.141
avg_num_tweets	0.0126	0.007	1.695	0.090	-0.002 0.027
roses_from_individual_dates	1.3700	0.655	2.092	0.036	0.086 2.654

Dep. Variable:	elimination	R-squared:	0.609
Model:	OLS	Adj. R-squared:	0.597
Method:	Least Squares	F-statistic:	54.41
Date:	Thu, 10 Dec 2015	Prob (F-statistic):	1.34e-27
Time:	17:42:42	Log-Likelihood:	-280.49
No. Observations:	145	AIC:	571.0
Df Residuals:	140	BIC:	585.9
Df Model:	4
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	2.0989	0.187	11.240	0.000	1.730 2.468
avg_num_tweets	0.0118	0.002	5.310	0.000	0.007 0.016
roses_from_individual_dates	1.7241	0.333	5.184	0.000	1.067 2.382
roses_from_group_dates	1.5851	0.357	4.446	0.000	0.880 2.290
pc2	-9.162e-05	4.85e-05	-1.891	0.061	-0.000 4.18e-06

Omnibus:	6.635	Durbin-Watson:	1.181
Prob(Omnibus):	0.036	Jarque-Bera (JB):	9.573
Skew:	0.197	Prob(JB):	0.00834
Kurtosis:	4.195	Cond. No.	8.32e+03

	age	avg_num_tweets	avg_tweet_sentiment	distance	elimination	geo_cluster	group_dates	individual_dates	profession	roses_from_group_dates	roses_from_individual_dates	total_tweet_sentiment	total_tweets	won	runner_up
55	23	151.000000	-0.187337	970.915710	10	3	2	1	1	0	1	-1.311356	1057	1	0
56	25	140.142857	0.006997	1638.689701	9	6	1	0	2	1	0	0.048978	981	0	1
57	26	74.666667	-0.048547	1371.988917	7	1	1	1	5	0	1	-0.291285	448	0	0
58	25	108.400000	-0.012097	1453.479117	6	1	1	1	1	0	1	-0.060484	542	0	0
59	23	12.750000	0.014583	965.750991	5	3	2	0	4	0	0	0.058333	51	0	0