Prediction of Winning / Elimination Week

Import Dependencies

In [1]:
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import statsmodels.api as sm

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

# special matplotlib argument for improved plots
from matplotlib import rcParams


import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import requests
import json
import math

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import *

from sklearn.metrics import roc_curve, auc

# import helper commands
from helpers import *

Import Data

In [2]:
# Load every data source used by the analysis

def _read_json(path):
    """Return the parsed contents of the JSON file at `path`."""
    with open(path) as json_file:
        return json.load(json_file)

# Data for each contestant, broken down by season
seasonsDict = _read_json('tempdata/seasonsDict.json')

# Twitter
twitter = _read_json("twitter_sentiment.json")

# Distances
distances = _read_json("distances.json")

# Geo-cluster
geo_cluster = _read_json("geo_cluster.json")

# Profession
professions = _read_json("profession_dict.json")

# Wiki info
wiki = pd.read_csv("contestantDF.csv")

# Date guide
date_guide = pd.read_csv("date_guide.csv")

# PCA data
raw_pca_data = _read_json('pca_dict.json')

# Re-key the PCA data by the contestant names used in `distances`: a raw PCA
# key is attached to every contestant name that contains it as a substring
# (when several raw keys match the same contestant, the last one wins).
pca_data = {
    sn: {
        contestant: raw_pca_data[sn][c]
        for contestant in distances[sn]
        for c in raw_pca_data[sn]
        if c in contestant
    }
    for sn in distances
}

# Competition data
comp = _read_json("competition_data.json")

Aggregate Data to be Used in DF

In [3]:
# Organize all the data into a single list of contestant records

def _tweet_summary(daily_tweets):
    """Aggregate one contestant's per-date Twitter records.

    `daily_tweets` maps a date key to a record with 'ntweet' and 'sentiment'
    entries. Returns the four Twitter feature values as a dict; every value
    is 0 when there are no records at all.
    """
    if len(daily_tweets) == 0:
        return {
            'avg_num_tweets': 0,
            'total_tweets': 0,
            'avg_tweet_sentiment': 0,
            'total_tweet_sentiment': 0,
        }
    ntweets = [day['ntweet'] for day in daily_tweets.values()]
    sentiments = [day['sentiment'] for day in daily_tweets.values()]
    return {
        'avg_num_tweets': np.mean(ntweets),
        'total_tweets': np.sum(ntweets),
        'avg_tweet_sentiment': np.mean(sentiments),
        'total_tweet_sentiment': np.sum(sentiments),
    }


contestants = []
for sn in seasonsDict:
    # only add data from seasons 13 and on
    if int(sn) <= 12:
        continue

    for contestant in seasonsDict[sn]:
        cname = contestant['name']

        # Add Twitter data. A Twitter key matches when it is a substring of
        # the contestant's name; if several keys match, the last one wins.
        # A contestant with no matching key gets no Twitter fields at all,
        # which shows up as NaN once the records become a DataFrame.
        for tname in twitter[sn]:
            if tname in cname:
                contestant.update(_tweet_summary(twitter[sn][tname]))

        # Contestants without a distance entry are left out of the result
        if cname not in distances[sn]:
            continue

        # add distance data
        contestant['distance'] = distances[sn][cname]

        # add profession data
        contestant['profession'] = professions[sn][cname]

        # add geo cluster data
        contestant['geo_cluster'] = geo_cluster[sn][cname]

        # add the two principal components when PCA data exists for this name
        if cname in pca_data[sn]:
            contestant['pc1'], contestant['pc2'] = pca_data[sn][cname]

        contestants.append(contestant)

Generate DF for All Data

In [4]:
# Create the data frame from the list of contestant records
df = pd.DataFrame(contestants)

# Rows containing NaN are currently kept (the dropna step below is disabled)
#df = df.dropna()
# just in case, we drop any duplicate samples
df = df.drop_duplicates()
# get rid of the samples whose elimination is 'unknown' (the bachelors, per the
# original note). Take an explicit copy so the column assignments below write to
# an independent frame instead of a slice (avoids SettingWithCopyWarning).
df = df[df.elimination != 'unknown'].copy()
# create 0/1 indicator columns for the winner and the runner-up
df['won'] = (df.elimination == 'Winner')*1
df['runner_up'] = (df.elimination == 'Runner-up')*1
# winners and runners-up are currently strings; `change_winner_runnerup`
# (from helpers) converts them so the column can be cast to integers
df['elimination'] = df['elimination'].apply(change_winner_runnerup)
df['elimination'] = df['elimination'].astype('int')
# create a data frame where the unnecessary columns are dropped.
# `axis` is passed by keyword: the positional form is rejected by pandas >= 2.0.
df_temp = df.drop(['hometown','name','occupation','season','pc1','pc2'], axis=1)
df_temp.head(10)
Out[4]:
age avg_num_tweets avg_tweet_sentiment distance elimination geo_cluster group_dates individual_dates profession roses_from_group_dates roses_from_individual_dates total_tweet_sentiment total_tweets won runner_up
0 25 13.285714 0.179650 1025.064422 10 5 2 1 5 0 1 1.257547 93 1 0
1 24 10.285714 0.071815 226.889927 9 4 1 1 3 1 1 0.502707 72 0 1
2 29 5.333333 -0.073958 1879.977325 7 7 2 1 2 0 0 -0.443750 32 0 0
3 24 1.200000 0.005556 2026.201031 6 6 3 0 5 1 0 0.027778 6 0 0
4 34 4.000000 0.062500 538.679446 5 3 0 2 3 0 2 0.250000 16 0 0
5 27 0.666667 -0.166667 397.346118 4 2 2 0 2 0 0 -0.500000 2 0 0
6 25 2.666667 -0.066667 103.284845 4 1 2 0 3 0 0 -0.200000 8 0 0
7 29 2.000000 0.166667 627.494033 4 5 2 0 3 0 0 0.500000 6 0 0
8 29 0.000000 0.000000 309.413472 4 4 0 0 3 0 0 0.000000 0 0 0
9 25 0.500000 0.000000 439.699607 3 1 2 0 1 0 0 0.000000 1 0 0

Get the shape of data

In [5]:
# (rows, columns) of the modelling frame
df_temp.shape
Out[5]:
(188, 15)

Get the summary statistics for each of the columns

In [6]:
# Summary statistics for each column of the modelling frame
df_temp.describe()
Out[6]:
avg_num_tweets avg_tweet_sentiment distance elimination geo_cluster group_dates individual_dates profession roses_from_group_dates roses_from_individual_dates total_tweet_sentiment total_tweets won runner_up
count 187.000000 187.000000 188.000000 188.000000 188.000000 188.000000 188.000000 188.000000 188.000000 188.000000 187.000000 187.000000 188.000000 188.000000
mean 69.345355 -0.004858 1167.517340 3.776596 3.909574 1.446809 0.292553 8.367021 0.175532 0.276596 -0.015337 322.203209 0.037234 0.037234
std 77.579937 0.056251 781.632641 2.787114 1.922931 1.531257 0.521770 72.654838 0.457892 0.515104 0.224103 460.825955 0.189840 0.189840
min 0.000000 -0.232394 86.282895 1.000000 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.311356 0.000000 0.000000 0.000000
25% 0.000000 -0.019816 660.512782 1.000000 2.000000 0.000000 0.000000 2.000000 0.000000 0.000000 -0.068591 0.000000 0.000000 0.000000
50% 21.000000 0.000000 1054.494144 3.000000 4.000000 1.000000 0.000000 3.000000 0.000000 0.000000 0.000000 65.000000 0.000000 0.000000
75% 151.482143 0.005344 1535.580050 6.000000 6.000000 3.000000 1.000000 5.000000 0.000000 0.000000 0.024874 528.000000 0.000000 0.000000
max 200.000000 0.250000 5323.980053 10.000000 7.000000 5.000000 2.000000 999.000000 3.000000 2.000000 1.257547 1714.000000 1.000000 1.000000

Look at any correlations between variables.

We look in particular at the correlation between the independent variables and each contestant's elimination week, as well as whether that contestant won the entire competition.

In [7]:
# Correlation of every numeric column with `won`.
# Numeric columns are selected explicitly because `df` still carries
# non-numeric columns (e.g. name, hometown, occupation); pandas >= 2.0 raises
# on those instead of silently skipping them.
df.select_dtypes(include=[np.number]).corr()['won']
Out[7]:
avg_num_tweets                 0.220814
avg_tweet_sentiment            0.007494
distance                      -0.043893
elimination                    0.440292
geo_cluster                    0.009272
group_dates                    0.181611
individual_dates               0.267352
pc1                           -0.200772
pc2                           -0.103522
profession                    -0.014954
roses_from_group_dates         0.170485
roses_from_individual_dates    0.276920
season                        -0.007815
total_tweet_sentiment         -0.014137
total_tweets                   0.410360
won                            1.000000
runner_up                     -0.038674
Name: won, dtype: float64

We look primarily at the variables that are correlated with winning the entire competition and those correlated with a later elimination week.

The variables with the highest positive correlation with winning the entire competition were the average number of tweets, the total number of individual and group dates, and the number of roses received on these dates.

In [8]:
# Correlation of every numeric column with the elimination week.
# Numeric columns are selected explicitly because `df` still carries
# non-numeric columns (e.g. name, hometown, occupation); pandas >= 2.0 raises
# on those instead of silently skipping them.
df.select_dtypes(include=[np.number]).corr()['elimination']
Out[8]:
avg_num_tweets                 0.611631
avg_tweet_sentiment           -0.020639
distance                      -0.084877
elimination                    1.000000
geo_cluster                    0.006188
group_dates                    0.742743
individual_dates               0.673995
pc1                           -0.053224
pc2                           -0.195776
profession                     0.031437
roses_from_group_dates         0.525342
roses_from_individual_dates    0.657872
season                         0.113756
total_tweet_sentiment         -0.023664
total_tweets                   0.803922
won                            0.440292
runner_up                      0.369544
Name: elimination, dtype: float64

The variables with a higher positive correlation with later elimination weeks were the same as those correlated with winning the entire competition. The correlations between these independent variables and the elimination week, however, were almost three times greater than the correlations between the same variables and winning the competition.

Correlation between average number of tweets and elimination week

In [9]:
# Scatter plot of elimination week against average number of tweets, with a fitted regression line
sns.regplot(y="elimination", x="avg_num_tweets", data=df, fit_reg = True)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a3050d0>
/Users/hopemason/anaconda/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):
In [10]:
# Scatter plot of elimination week against roses received on individual dates, with a fitted regression line
sns.regplot(y="elimination", x="roses_from_individual_dates", data=df, fit_reg = True)
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b289290>

Looking at the Correlation between Independent Variables

We attempt to visualize the correlation between the independent variables that are positively correlated with the dependent variables (winning and elimination week).

In [11]:
# Plot a histogram of the average number of tweets, one panel per number of individual dates
# NOTE(review): seaborn 0.9 renamed FacetGrid's `size` argument to `height` - switch if seaborn is upgraded
g = sns.FacetGrid(col="individual_dates", data=df, size=8)
g.map(plt.hist, "avg_num_tweets")
Out[11]:
<seaborn.axisgrid.FacetGrid at 0x10b5f3710>

The distribution of average tweets is shifted higher for contestants who received individual dates.

In [12]:
# Plot a histogram of the average number of tweets, one panel per number of roses received on individual dates
# NOTE(review): seaborn 0.9 renamed FacetGrid's `size` argument to `height` - switch if seaborn is upgraded
g = sns.FacetGrid(col="roses_from_individual_dates", data=df, size=8)
g.map(plt.hist, "avg_num_tweets")
Out[12]:
<seaborn.axisgrid.FacetGrid at 0x10b6376d0>

Likewise, the distribution of average tweets is shifted higher for contestants who received a rose on an individual date.

In [13]:
# Plot a histogram of the average tweet sentiment, one panel per number of roses received on individual dates
# NOTE(review): seaborn 0.9 renamed FacetGrid's `size` argument to `height` - switch if seaborn is upgraded
g = sns.FacetGrid(col="roses_from_individual_dates", data=df, size=8)
g.map(plt.hist, "avg_tweet_sentiment")
Out[13]:
<seaborn.axisgrid.FacetGrid at 0x10d1ce150>

We also check whether contestants who received roses on individual dates tend to have a higher average Twitter sentiment. There is a relatively higher share of contestants with higher Twitter sentiment among those who received roses on individual dates.

Run Logistic Regression on Data

We want to run a logistic regression to determine the probability of a contestant winning the competition.

In [14]:
# fit logistic regression model
# (`ols`, imported here, is also what the linear regression cell further down relies on)
import statsmodels.api as sm
from statsmodels.formula.api import logit, glm, ols

# Model the 0/1 `won` indicator from average tweets and roses from individual dates.
# NOTE(review): the summary reports 187 observations against 188 rows in the frame -
# presumably the row with missing Twitter fields is dropped by the formula API; confirm.
logit_model = logit('won ~ avg_num_tweets + roses_from_individual_dates',df).fit()
logit_model.summary()
Optimization terminated successfully.
         Current function value: 0.123483
         Iterations 9
Out[14]:
Logit Regression Results
Dep. Variable: won No. Observations: 187
Model: Logit Df Residuals: 184
Method: MLE Df Model: 2
Date: Thu, 10 Dec 2015 Pseudo R-squ.: 0.2268
Time: 17:42:42 Log-Likelihood: -23.091
converged: True LL-Null: -29.864
LLR p-value: 0.001145
coef std err z P>|z| [95.0% Conf. Int.]
Intercept -5.5503 1.229 -4.515 0.000 -7.960 -3.141
avg_num_tweets 0.0126 0.007 1.695 0.090 -0.002 0.027
roses_from_individual_dates 1.3700 0.655 2.092 0.036 0.086 2.654

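Only two explanatory variables were kept in the model: the average number of tweets and the number of roses a contestant received from individual dates. Of these, roses from individual dates is statistically significant at the 5% level (p = 0.036), while the average number of tweets is significant only at the 10% level (p = 0.090). Consequently, our pseudo R-squared is low (0.227), in part because of the small number of explanatory variables used in the regression.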

Run a Linear Regression on Data

We want to run a linear regression to estimate a contestant's elimination week.

In [15]:
# Ordinary least squares fit of elimination week on tweet volume, roses received
# and the second principal component (`ols` comes from the statsmodels import in
# the logistic regression cell above).
model = ols('elimination ~ avg_num_tweets + roses_from_individual_dates + roses_from_group_dates + pc2 ',df).fit()
model.summary()
Out[15]:
OLS Regression Results
Dep. Variable: elimination R-squared: 0.609
Model: OLS Adj. R-squared: 0.597
Method: Least Squares F-statistic: 54.41
Date: Thu, 10 Dec 2015 Prob (F-statistic): 1.34e-27
Time: 17:42:42 Log-Likelihood: -280.49
No. Observations: 145 AIC: 571.0
Df Residuals: 140 BIC: 585.9
Df Model: 4
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 2.0989 0.187 11.240 0.000 1.730 2.468
avg_num_tweets 0.0118 0.002 5.310 0.000 0.007 0.016
roses_from_individual_dates 1.7241 0.333 5.184 0.000 1.067 2.382
roses_from_group_dates 1.5851 0.357 4.446 0.000 0.880 2.290
pc2 -9.162e-05 4.85e-05 -1.891 0.061 -0.000 4.18e-06
Omnibus: 6.635 Durbin-Watson: 1.181
Prob(Omnibus): 0.036 Jarque-Bera (JB): 9.573
Skew: 0.197 Prob(JB): 0.00834
Kurtosis: 4.195 Cond. No. 8.32e+03

The linear regression provides a better model for estimating the episode in which a contestant will be eliminated (R-squared of 0.609). In addition to average number of tweets and roses from individual dates, roses from group dates was also statistically significant, and principal component 2 was marginally significant (p = 0.061).

In [16]:
# Bar chart of the fitted OLS coefficients, one bar per model term
coefs = list(model.params.values)
features = list(model.params.keys())
positions = range(len(features))

plt.bar(positions, coefs, align="center", width=.5)
plt.xticks(positions, features, rotation=45)
plt.title("Regression Results")
plt.ylabel("Regression Coefficients")
plt.show()

Breakdown Data by Season

We want to break down the data by season to account for the differences in the number of tweets from each season, since the average number of tweets about all contestants should increase from year to year due to Twitter's broader adoption.

In [17]:
# One modelling frame per season (13 through 19), keyed by season number.
# The row mask comes from `df` because `season` was dropped from `df_temp`;
# both frames share the same index.
seasons_dfs = {
    season: df_temp[df['season'] == season]
    for season in range(13, 20)
}
In [18]:
# Preview the first rows of the season 14 frame
seasons_dfs[14].head()
Out[18]:
age avg_num_tweets avg_tweet_sentiment distance elimination geo_cluster group_dates individual_dates profession roses_from_group_dates roses_from_individual_dates total_tweet_sentiment total_tweets won runner_up
55 23 151.000000 -0.187337 970.915710 10 3 2 1 1 0 1 -1.311356 1057 1 0
56 25 140.142857 0.006997 1638.689701 9 6 1 0 2 1 0 0.048978 981 0 1
57 26 74.666667 -0.048547 1371.988917 7 1 1 1 5 0 1 -0.291285 448 0 0
58 25 108.400000 -0.012097 1453.479117 6 1 1 1 1 0 1 -0.060484 542 0 0
59 23 12.750000 0.014583 965.750991 5 3 2 0 4 0 0 0.058333 51 0 0
In [19]:
# Season 14 only: correlation of every numeric column with `won`.
# Numeric columns are selected explicitly; the frame appears to hold at least
# one non-numeric column (`age` is shown by head() but missing from the
# describe()/corr() output), and pandas >= 2.0 raises on such columns.
seasons_dfs[14].select_dtypes(include=[np.number]).corr()['won']
Out[19]:
avg_num_tweets                 0.454344
avg_tweet_sentiment           -0.453450
distance                      -0.024731
elimination                    0.554021
geo_cluster                   -0.082479
group_dates                    0.374192
individual_dates               0.302849
profession                    -0.223972
roses_from_group_dates        -0.060193
roses_from_individual_dates    0.302849
total_tweet_sentiment         -0.849981
total_tweets                   0.628029
won                            1.000000
runner_up                     -0.041667
Name: won, dtype: float64

It is interesting to note that average and total tweet sentiment have high negative correlations with winning the competition (-0.453450 and -0.849981, respectively), while the average and total number of tweets maintain a high positive correlation with winning the competition (0.454344 and 0.628029, respectively).

In [20]:
# Season 14 only: correlation of every numeric column with the elimination week.
# Numeric columns are selected explicitly; the frame appears to hold at least
# one non-numeric column (`age` is shown by head() but missing from the
# describe()/corr() output), and pandas >= 2.0 raises on such columns.
seasons_dfs[14].select_dtypes(include=[np.number]).corr()['elimination']
Out[20]:
avg_num_tweets                 0.726037
avg_tweet_sentiment           -0.376984
distance                       0.402683
elimination                    1.000000
geo_cluster                   -0.170559
group_dates                    0.661066
individual_dates               0.501505
profession                    -0.176804
roses_from_group_dates         0.276867
roses_from_individual_dates    0.501505
total_tweet_sentiment         -0.583122
total_tweets                   0.881673
won                            0.554021
runner_up                      0.473494
Name: elimination, dtype: float64

The same patterns hold when looking at these independent variables and a contestant's elimination week.

In [21]:
# Season 14: elimination week against average tweet sentiment, with a fitted regression line
sns.regplot(y="elimination", x="avg_tweet_sentiment", data=seasons_dfs[14], fit_reg = True)
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ebe9850>
In [22]:
# Season 14: elimination week against average number of tweets, with a fitted regression line
sns.regplot(y="elimination", x="avg_num_tweets", data=seasons_dfs[14], fit_reg = True)
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ed60910>
In [ ]:
 
In [ ]: