Import Dependencies
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import statsmodels.api as sm
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
# special matplotlib argument for improved plots
from matplotlib import rcParams
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import requests
import json
import math
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import *
from sklearn.metrics import roc_curve, auc
# import helper commands
from helpers import *
#Import all data sources
def _read_json(path):
    """Open *path*, parse it as JSON, and return the resulting object."""
    with open(path) as fh:
        return json.load(fh)

#Data for each contestant broken down by seasons
seasonsDict = _read_json('tempdata/seasonsDict.json')
#Twitter
twitter = _read_json("twitter_sentiment.json")
#Distances
distances = _read_json("distances.json")
#Geo-Cluster
geo_cluster = _read_json("geo_cluster.json")
#Profession
professions = _read_json("profession_dict.json")
#Wiki Info
wiki = pd.read_csv("contestantDF.csv")
#Date Guide
date_guide = pd.read_csv("date_guide.csv")
# PCA data
raw_pca_data = _read_json('pca_dict.json')
# Re-key the raw PCA values onto the full contestant names used in
# `distances`: a raw key counts as a match when it appears as a substring
# of the contestant name (twitter/PCA files use shortened names).
pca_data = {}
for season, names in distances.items():
    season_pca = {}
    for name in names:
        for short_name, components in raw_pca_data[season].items():
            if short_name in name:
                season_pca[name] = components
    pca_data[season] = season_pca
#Competition Data
with open("competition_data.json") as json_file:
    comp = json.load(json_file)
# Organize all the data into a single list of per-contestant dicts.
contestants = []
for sn in seasonsDict:
    if int(sn) <= 12:
        continue  # only add data from seasons 13 and on
    for contestant in seasonsDict[sn]:
        cname = contestant['name']
        # --- Twitter data ---
        # Twitter keys are partial names, so a key matches when it appears
        # as a substring of the full contestant name.
        foundTwitterName = False
        for tname in twitter[sn]:
            if tname in cname:
                foundTwitterName = True
                daily = twitter[sn][tname]  # hoist the repeated lookup
                if len(daily) > 0:
                    ntweets = [daily[date]['ntweet'] for date in daily]
                    tweetSentiment = [daily[date]['sentiment'] for date in daily]
                    contestant['avg_num_tweets'] = np.mean(ntweets)
                    contestant['total_tweets'] = np.sum(ntweets)
                    contestant['avg_tweet_sentiment'] = np.mean(tweetSentiment)
                    contestant['total_tweet_sentiment'] = np.sum(tweetSentiment)
                else:
                    contestant['avg_num_tweets'] = 0
                    contestant['total_tweets'] = 0
                    contestant['avg_tweet_sentiment'] = 0
                    contestant['total_tweet_sentiment'] = 0
        # BUGFIX: foundTwitterName was tracked but never used, so contestants
        # with no twitter match at all were left without these keys and became
        # NaN columns in the DataFrame. Default them to 0, matching the
        # empty-tweet-history branch above.
        if not foundTwitterName:
            contestant['avg_num_tweets'] = 0
            contestant['total_tweets'] = 0
            contestant['avg_tweet_sentiment'] = 0
            contestant['total_tweet_sentiment'] = 0
        if cname in distances[sn]:
            # add distance data
            contestant['distance'] = distances[sn][cname]
            # add profession data
            contestant['profession'] = professions[sn][cname]
            # add geo cluster data
            contestant['geo_cluster'] = geo_cluster[sn][cname]
            if cname in pca_data[sn]:
                contestant['pc1'], contestant['pc2'] = pca_data[sn][cname]
            # NOTE(review): original indentation was lost in extraction; the
            # append is placed inside the distance check (only contestants
            # with distance data are kept) — confirm against the notebook.
            contestants.append(contestant)
# Create a data frame from the contestants list
df = pd.DataFrame(contestants)
# get rid of any samples that contain NaN
#df = df.dropna()
# just in case, we drop any duplicate samples
df = df.drop_duplicates()
# get rid of the samples for bachelors
df = df[df.elimination != 'unknown']
# create 0/1 indicator columns for winning / finishing runner-up
df['won'] = (df.elimination == 'Winner').astype(int)
df['runner_up'] = (df.elimination == 'Runner-up').astype(int)
# winners and runners up are currently strings. We need to turn them into integers
df['elimination'] = df['elimination'].apply(change_winner_runnerup)
df['elimination'] = df['elimination'].astype('int')
# create a dataframe with the unnecessary columns dropped
# FIX: the positional axis argument to DataFrame.drop is deprecated; use the
# axis=1 keyword explicitly.
df_temp = df.drop(['hometown', 'name', 'occupation', 'season', 'pc1', 'pc2'], axis=1)
df_temp.head(10)
df_temp.shape
df_temp.describe()
We look in particular at the correlation between the predictor variables and each contestant's elimination week, as well as whether that contestant won the entire competition.
# Correlation of every numeric column with the `won` indicator.
# numeric_only=True is required: df still contains string columns
# (hometown, name, occupation, ...) which pandas >= 2.0 no longer
# silently drops from .corr().
df.corr(numeric_only=True).won
We look primarily at those variables that are correlated with winning the entire competition and those correlated with a higher elimination week.
Those variables with a higher positive correlation with winning the entire competition were the average number of tweets, the number of total individual and group dates, as well as the number of roses acquired from these dates.
# Correlation of every numeric column with the elimination week.
# numeric_only=True: see the won-correlation cell — string columns remain
# in df and pandas >= 2.0 raises without it.
df.corr(numeric_only=True).elimination
Those variables with a higher positive correlation with higher elimination weeks were the same as those that correlated with winning the entire competition. The correlation between these predictor variables and the elimination week, however, was almost three times greater than the correlation between the same variables and winning the competition.
# Scatter plots with fitted regression lines: elimination week against the
# two strongest predictors found above.
sns.regplot(x="avg_num_tweets", y="elimination", data=df, fit_reg=True)
sns.regplot(x="roses_from_individual_dates", y="elimination", data=df, fit_reg=True)
We attempt to visualize the relationship between the outcome variables and the predictor variables that are positively correlated with them.
# Plot histograms of number of tweets, one panel per individual-date count.
# FIX: seaborn renamed FacetGrid's `size` parameter to `height` in 0.9.
g = sns.FacetGrid(data=df, col="individual_dates", height=8)
g.map(plt.hist, "avg_num_tweets")
There is a higher distribution of tweets for those individuals that received individual dates.
# Plot histograms of number of tweets, one panel per count of roses
# received from individual dates.
# FIX: seaborn renamed FacetGrid's `size` parameter to `height` in 0.9.
g = sns.FacetGrid(data=df, col="roses_from_individual_dates", height=8)
g.map(plt.hist, "avg_num_tweets")
Likewise, there is a higher distribution of tweets for those candidates that received a rose from individual dates.
# Plot histograms of average tweet sentiment, one panel per count of roses
# received from individual dates (the original comment was truncated).
# FIX: seaborn renamed FacetGrid's `size` parameter to `height` in 0.9.
g = sns.FacetGrid(data=df, col="roses_from_individual_dates", height=8)
g.map(plt.hist, "avg_tweet_sentiment")
We also look to see whether roses from individual dates correspond to a higher distribution of contestants with higher average Twitter sentiment. There is a relatively higher distribution of contestants who received roses from individual dates and also had higher Twitter sentiments.
We want to run a logistic regression to determine the probability of a contestant winning the competition.
# Fit a logistic regression for the probability of winning the competition,
# using the two predictors that correlated most strongly with `won`.
import statsmodels.api as sm
from statsmodels.formula.api import logit, glm, ols
win_formula = 'won ~ avg_num_tweets + roses_from_individual_dates'
logit_model = logit(win_formula, df).fit()
logit_model.summary()
The only predictor variables that were statistically significant were the average number of tweets and the number of roses a contestant received from individual dates. Consequently, our R-squared is very low because of the small number of predictors used in the regression.
We want to run a linear regression to estimate a contestant's elimination week.
# Fit an ordinary-least-squares model for the elimination week, then plot
# the fitted coefficients as a bar chart.
elim_formula = 'elimination ~ avg_num_tweets + roses_from_individual_dates + roses_from_group_dates + pc2 '
model = ols(elim_formula, df).fit()
model.summary()
features = list(model.params.keys())
coefs = list(model.params.values)
positions = range(len(features))
plt.bar(positions, coefs, align="center", width=0.5)
plt.xticks(positions, features, rotation=45)
plt.ylabel("Regression Coefficients")
plt.title("Regression Results")
plt.show()
We want to break down the data by season to account for the differences in the number of tweets in each season, since the average number of tweets about all contestants should increase from year to year due to Twitter's broader adoption.
# Split df_temp into one frame per season (13-19).
seasons_dfs = {}
for sn in range(13, 20):
    # NOTE(review): the boolean mask comes from `df` but indexes `df_temp`;
    # this relies on both frames sharing the same index, which holds here
    # because df_temp is a column-drop of df. It also assumes df['season']
    # holds ints comparable to `sn` — confirm against the source data.
    seasons_dfs[sn] = df_temp[df['season'] == sn]
seasons_dfs[14].head()
# FIX: numeric_only=True is required on pandas >= 2.0 when non-numeric
# columns remain in the frame.
seasons_dfs[14].corr(numeric_only=True).won
It is interesting to note that average and total tweet sentiment have high negative correlations with winning the competition (-0.400540 and -0.849981, respectively), while the average and total number of tweets maintains a high positive correlation with winning the competition (0.454344 and 0.628029, respectively).
# Correlation with elimination week within season 14 alone.
# FIX: numeric_only=True is required on pandas >= 2.0 when non-numeric
# columns remain in the frame.
seasons_dfs[14].corr(numeric_only=True).elimination
The same phenomena are true when looking at these dependent variables and a contestant's elimination week.
# Season-14 regression plots: elimination week against tweet sentiment
# and tweet volume.
season14 = seasons_dfs[14]
sns.regplot(x="avg_tweet_sentiment", y="elimination", data=season14, fit_reg=True)
sns.regplot(x="avg_num_tweets", y="elimination", data=season14, fit_reg=True)