Classify + Predict Weekly Success on The Bachelor¶

%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
import json
import math

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

First we import all data sources

#Import all data sources

def _read_json(path):
    """Parse and return the JSON document stored at `path`."""
    with open(path) as handle:
        return json.load(handle)

#JSON-backed lookups, loaded in the same order as before:
#twitter sentiment, distances, geo-cluster and profession
twitter = _read_json("twitter_sentiment.json")
distances = _read_json("distances.json")
geo_cluster = _read_json("geo_cluster.json")
professions = _read_json("profession_dict.json")

#CSV tables: wiki contestant info and the date guide
wiki = pd.read_csv("contestantDF.csv")
date_guide = pd.read_csv("date_guide.csv")

#Competition data
comp = _read_json("competition_data.json")

#Build wiki["good_elim"]: the elimination week, stored as a string
#Winner -> "11", Runner-up -> "10", bachelor -> "0"
#Every other entry keeps the last space-separated token of 'elimination week'
#For seasons 13 and 14, weeks 2 through 9 are shifted forward by 2

special_weeks = {"Winner": "11", "Runner-up": "10", "bachelor": "0"}

new_elim = [special_weeks.get(raw_week, raw_week).split(" ")[-1]
            for raw_week in wiki['elimination week'].tolist()]

all_seasons = wiki['season'].tolist()

def _shift_short_season(elim, season):
    """Return `elim` moved forward two weeks when it is week 2-9 of season 13 or 14."""
    if season in [13,14] and 1 < int(elim) < 10:
        return str(int(elim)+2)
    return elim

good_elim = [_shift_short_season(elim, season)
             for elim, season in zip(new_elim, all_seasons)]

wiki["good_elim"] = good_elim

Fitting a Classifier¶

We fit a different classifier for each elimination week of the Bachelor.

We use a Logistic Classifier with an L1 penalty - similar to the process of HW3. Our response variable is whether the contestant was eliminated or advanced past that week's competition.

#Borrow the do_classify and cv_optimize functions from HW3
#These will help us get the best hyperparameters for our classifier
def do_classify(clf, parameters, indf, featurenames, targetname, target1val, mask=None, reuse_split=None, score_func=None, n_folds=5):
    """Fit `clf` on a train/test split of `indf` and print its accuracy.

    Parameters:
        clf: estimator exposing fit / score / predict.
        parameters: hyperparameter grid handed to cv_optimize; when falsy the
            grid search is skipped and `clf` is fit as given.
        indf: DataFrame holding both the predictors and the target column.
        featurenames: list of predictor column names.
        targetname: name of the target column.
        target1val: target value encoded as class 1 (everything else is 0).
        mask: optional boolean array, True for training rows.
        reuse_split: optional dict with keys 'Xtrain', 'Xtest', 'ytrain',
            'ytest'; when given together with `mask` it takes precedence.
        score_func: optional scoring argument forwarded to cv_optimize.
        n_folds: number of cross-validation folds forwarded to cv_optimize.

    Returns:
        (clf, Xtrain, ytrain, Xtest, ytest) with `clf` fitted on the
        training portion.

    Raises:
        ValueError: if neither `mask` nor `reuse_split` is supplied.
    """
    subdf = indf[featurenames]
    X = subdf.values
    y = (indf[targetname].values == target1val) * 1

    #Without a split there is nothing to train on; fail with a clear message
    #instead of an unbound-variable error further down
    if mask is None and reuse_split is None:
        raise ValueError("do_classify needs either mask or reuse_split")

    #Identity checks: comparing an array to None with != is elementwise
    if mask is not None:
        print("using mask")
        #Force boolean dtype so ~mask is a logical (not bitwise) complement
        mask = np.asarray(mask, dtype=bool)
        Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    if reuse_split is not None:
        print("using reuse split")
        Xtrain, Xtest, ytrain, ytest = reuse_split['Xtrain'], reuse_split['Xtest'], reuse_split['ytrain'], reuse_split['ytest']

    if parameters:
        clf = cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=n_folds, score_func=score_func)
    clf = clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)

    #Single-argument print calls produce the same output as print statements
    print("############# based on standard predict ################")
    print("Accuracy on training data: %0.2f" % (training_accuracy))
    print("Accuracy on test data:     %0.2f" % (test_accuracy))
    print(confusion_matrix(ytest, clf.predict(Xtest)))
    print("########################################################")
    return clf, Xtrain, ytrain, Xtest, ytest

#We also borrow cv_optimize from HW3, which runs a grid search to find the best hyperparameters
#This function is called from inside the do_classify function
def cv_optimize(clf, parameters, X, y, n_folds=5, score_func=None):
    """Grid-search `parameters` for `clf` and return the best estimator.

    The search uses `n_folds` cross-validation folds on (X, y) with a single
    job. `score_func` is passed as the `scoring` argument only when it is
    truthy; otherwise the grid search is built without a scoring argument.
    """
    search_options = dict(param_grid=parameters, cv=n_folds, n_jobs=1)
    if score_func:
        search_options["scoring"] = score_func

    searcher = GridSearchCV(clf, **search_options)
    searcher.fit(X,y)
    return searcher.best_estimator_

Week 1¶

We start with Week 1, where we predict performance based on each contestant's fundamental data.

#Week 1 uses fundamentals only:
#geo-cluster, profession, age, photo, distances
all_bach = wiki[wiki['good_elim'] == "0"]
week1_contestants = wiki[wiki['good_elim'] != "0"]
week1_success = week1_contestants['good_elim']!="1"

#One (season, name, age) triple per contestant
week1_rows = list(zip(week1_contestants['season'], week1_contestants['name'],
                      week1_contestants['age']))

#Look each contestant up in the per-season dictionaries
distances_series = [distances[str(season)][cname] for season, cname, cage in week1_rows]
geo_series = [geo_cluster[str(season)][cname] for season, cname, cage in week1_rows]
profession_series = [professions[str(season)][cname] for season, cname, cage in week1_rows]

#Age of the season's bachelor minus the contestant's age
age_diff_series = [int(all_bach[all_bach["season"]==season]["age"]) - int(cage)
                   for season, cname, cage in week1_rows]

age_series = week1_contestants['age']

#Normalize distances
def normal_func(a_series):
    """Return the values of `a_series` as a list of z-scores.

    Each value has the mean (np.mean) subtracted and is divided by the
    standard deviation (np.std) of the whole series.
    """
    center = np.mean(a_series)
    spread = np.std(a_series)
    standardized = []
    for value in a_series:
        standardized.append((value - center) / spread)
    return standardized

distances_norm = normal_func(distances_series)

#Collect the week 1 response and predictors, one column per entry
week1_columns = {
    "SUCCESS": week1_success,
    "DIST": distances_series,
    "GEO": geo_series,
    "PROF": profession_series,
    "AGE_DIFF": age_diff_series,
    "AGE": age_series,
    "DIST_NORM": distances_norm,
}
week1_data = pd.DataFrame(week1_columns)

To find the best hyperparameters for the logistic classifier, we borrow the do_classify and cv_optimize functions from HW3. The do_classify function takes our predictor/response dataframe, and fits/tests the best classifier returned from cv_optimize. The function cv_optimize runs a grid search over a list of hyperparameters with n-fold cross-validation, and returns the classifier with the optimal hyperparameter (i.e. regularization) that best fits the training data. We report the training and test accuracy of the classifier.

#70/30 train/test split, expressed as a boolean mask (True = training row)
n_rows = week1_data.shape[0]
itrain, itest = train_test_split(xrange(n_rows), train_size=0.7)
mask = np.ones(n_rows, dtype=bool)
mask[itest] = False

#Grid-search C for the L1 logistic classifier and report accuracy
CC = {"C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
pred_col = ['AGE_DIFF', 'DIST_NORM', 'GEO', 'PROF']
week_model = LogisticRegression(penalty="l1")
clf, Xtrain, ytrain, Xtest, ytest = do_classify(week_model, CC, week1_data, pred_col,
                                                'SUCCESS', 1, mask=mask, n_folds=10)

#Fraction of class-1 rows in each split, then the test-set predictions
print(sum(ytest) / float(len(ytest)))
print(sum(ytrain) / float(len(ytrain)))
print(clf.predict(Xtest))

#Bar chart of the fitted coefficients, one bar per predictor
coefs = clf.coef_[0]
bar_pos = range(len(coefs))
plt.bar(bar_pos, coefs, align="center", width=.5)
plt.xticks(bar_pos, pred_col)
plt.ylabel("Regression Coefficients")
plt.title("Week 1 - Regression Results")
plt.show()

using mask
############# based on standard predict ################
Accuracy on training data: 0.71
Accuracy on test data:     0.61
[[ 1 19]
 [ 3 34]]
########################################################
0.649122807018
0.69465648855
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:7: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.

Get Data / Fit Classifier for Other Weeks¶

We define a function that pulls in all our data depending on the elimination week in question. These data include fundamentals, twitter data, and date outcomes.

An additional caveat is that between seasons 14 and 15, The Bachelor expanded from an 8 episode to a 10 episode format. Thus to try and keep the data consistent, we only do analyses for weeks 2 and 3 on the season 15-19 data. When we get to Week 4 - we will use Week 2 data from seasons 13 and 14.

#Define a function to get all data for a certain week 
def get_week_data(which_week, early_episode):
    """Assemble the predictor/response table for one elimination week.

    Reads the module-level data sources loaded earlier in this notebook:
    wiki, date_guide, distances, geo_cluster, professions, twitter and comp.

    Parameters:
        which_week: integer elimination week to model. Rows whose good_elim
            is "0" or any week before which_week are dropped.
        early_episode: when true, seasons 13 and 14 are excluded and episode
            dates are taken from seasons 15-19 only; otherwise seasons 13-19.

    Returns:
        (week_data, week_success). week_data is a DataFrame with columns
        SUCCESS, DIST, GEO, PROF, AGE_DIFF, AGE, DIST_NORM, TSENT,
        TWEET_SHARE, GROSE, GDATE and ODATE; week_success is the boolean
        Series good_elim != str(which_week) for the remaining contestants.
    """
    #Drop the bachelors ("0") and everyone eliminated before this week
    already_out = ["0"] + [str(qq) for qq in range(1, which_week)]
    week_contestants = wiki[~wiki['good_elim'].isin(already_out)]
    season_range = range(13,20)
    if early_episode:
        #Build the mask from the subset itself so it lines up row-for-row
        #with the frame being filtered
        week_contestants = week_contestants[~week_contestants['season'].isin([13,14])]
        season_range = range(15,20)
    week_success = week_contestants['good_elim']!=str(which_week)
    all_bach = wiki[wiki['good_elim'] == "0"]

    #Get right episode dates
    right_dates = date_guide[date_guide['Episode']==which_week]
    right_dates = right_dates[right_dates['Season'].isin(season_range)]

    #Get all data sources from libraries
    distances_series = []
    geo_series = []
    profession_series = []
    age_diff_series = []
    num_tweets = []
    tweet_sent = []
    group_rose = []
    date_rose = []
    group_date = []
    total_season_tweets = dict.fromkeys(season_range, 0.0)

    #Per-season values that do not change from contestant to contestant
    bachelor_age = {}
    first_name_counts = {}

    for season, cname, cage in zip(week_contestants['season'],week_contestants['name'], \
                                  week_contestants['age']):
        #Get distances / spatial cluster
        distances_series.append(distances[str(season)][cname])
        geo_series.append(geo_cluster[str(season)][cname])

        #Get profession
        profession_series.append(professions[str(season)][cname])

        #Get Age Difference (bachelor's age minus contestant's age)
        if season not in bachelor_age:
            bachelor_age[season] = int(all_bach[all_bach["season"]==season]["age"].iloc[0])
        age_diff_series.append(bachelor_age[season] - int(cage))

        #Get Twitter Data - contestants with no entry for this episode count
        #as zero tweets / zero sentiment. Both values are read before either
        #list is touched so a failed lookup can never append twice.
        tname = cname.split(" ")[0]
        try:
            episode = right_dates[right_dates['Season']==season]['Date'].tolist()[0]
            right_tweets = twitter[str(season)][tname][str(episode)]
            ntweet = right_tweets["ntweet"]
            sentiment = right_tweets["sentiment"]
        except (KeyError, IndexError, TypeError):
            ntweet, sentiment = 0, 0
        num_tweets.append(ntweet)
        tweet_sent.append(sentiment)
        total_season_tweets[season] = total_season_tweets.get(season, 0.0) + ntweet

        #Get competition Data
        #When two contestants in the season share a first name, the lookup
        #name is "First L" (first name plus last initial)
        if season not in first_name_counts:
            first_name_counts[season] = _first_name_counts(season)
        if first_name_counts[season].get(tname, 0) == 2:
            comp_name = tname + " " + cname.split(" ")[1][0]
        else:
            comp_name = tname

        #Week 2 has no prior-week date features; every other week reads the
        #dates of the week before
        if which_week != 2:
            gdate, grose, drose = _date_outcome(season, which_week - 1, comp_name)
        else:
            gdate, grose, drose = 0, 0, 0
        group_date.append(gdate)
        group_rose.append(grose)
        date_rose.append(drose)

    #Get Tweet Share - i.e. proportion of contestant tweets to total tweets / episode
    tweet_share = []
    for season, ntweet in zip(week_contestants['season'], num_tweets):
        season_total = total_season_tweets[season]
        #A season with no tweets at all gives every contestant a share of 0
        if season_total:
            tweet_share.append(ntweet/float(season_total))
        else:
            tweet_share.append(0.0)

    #Get raw age of each contestant
    age_series = week_contestants['age']

    #Normalize distances
    def normal_func(a_series):
        meanval = np.mean(a_series)
        sdval = np.std(a_series)
        return [(qq - meanval) / sdval for qq in a_series]

    distances_norm = normal_func(distances_series)

    #Put data into pandas dataframe
    week_data = pd.DataFrame({"SUCCESS": week_success, "DIST": distances_series, \
                               "GEO": geo_series, "PROF":profession_series, \
                               "AGE_DIFF":age_diff_series, "AGE":age_series, \
                               "DIST_NORM": distances_norm, "TSENT": tweet_sent, \
                              "TWEET_SHARE": tweet_share, "GROSE": group_rose, \
                             "GDATE": group_date, "ODATE": date_rose})

    return week_data, week_success


def _first_name_counts(season):
    """Map each first name in `season` of the wiki table to how many people share it."""
    counts = {}
    for full_name in wiki[wiki['season'] == season]['name'].tolist():
        first = full_name.split(" ")[0]
        counts[first] = counts.get(first, 0) + 1
    return counts


def _date_outcome(season, week, comp_name):
    """Return the (GDATE, GROSE, ODATE) 0/1 flags for one contestant in one week.

    Looks up comp[str(season)][str(week)][comp_name]. A missing season, week
    or contestant, or an empty entry, yields (0, 0, 0) for that contestant
    only. An entry whose first key contains "grou" is treated as a group
    date; any other entry sets the ODATE flag from its first value.
    """
    try:
        week_dates = comp[str(season)][str(week)]
    except KeyError:
        return 0, 0, 0
    if comp_name not in week_dates:
        return 0, 0, 0
    contestant_date = week_dates[comp_name]
    if not contestant_date:
        return 0, 0, 0

    date_type = list(contestant_date.keys())[0]
    #presumably True means the contestant received a rose - verify against the data
    outcome = 1 if list(contestant_date.values())[0] == True else 0
    if "grou" in date_type:
        return 1, outcome, 0
    return 0, 0, outcome

Week 2¶

week2_data, week2_success = get_week_data(2, True)

#70/30 train/test split, expressed as a boolean mask (True = training row)
n_rows = week2_data.shape[0]
itrain, itest = train_test_split(xrange(n_rows), train_size=0.7)
mask = np.ones(n_rows, dtype=bool)
mask[itest] = False

#Grid-search C for the L1 logistic classifier and report accuracy
CC = {"C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
pred_col = ['AGE_DIFF', 'DIST_NORM', 'GEO', 'PROF', 'TSENT', 'TWEET_SHARE']
week_model = LogisticRegression(penalty="l1")
clf, Xtrain, ytrain, Xtest, ytest = do_classify(week_model, CC, week2_data, pred_col,
                                                'SUCCESS', 1, mask=mask, n_folds=3)

#Fraction of class-1 rows in each split, then the test-set predictions
print(sum(ytest) / float(len(ytest)))
print(sum(ytrain) / float(len(ytrain)))
print(clf.predict(Xtest))

#Bar chart of the fitted coefficients, one bar per predictor
coefs = clf.coef_[0]
bar_pos = range(len(coefs))
plt.bar(bar_pos, coefs, align="center", width=.5)
plt.xticks(bar_pos, pred_col)
plt.ylabel("Regression Coefficients")
plt.title("Week 2 - Regression Results")
plt.show()

using mask
############# based on standard predict ################
Accuracy on training data: 0.99
Accuracy on test data:     0.87
[[ 2  2]
 [ 2 24]]
########################################################
0.866666666667
0.823529411765
[1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0]

/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:7: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.

Week 3¶

week3_data, week3_success = get_week_data(3, True)

#70/30 train/test split, expressed as a boolean mask (True = training row)
n_rows = week3_data.shape[0]
itrain, itest = train_test_split(xrange(n_rows), train_size=0.7)
mask = np.ones(n_rows, dtype=bool)
mask[itest] = False

#Grid-search C for the L1 logistic classifier and report accuracy
CC = {"C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
pred_col = ['AGE_DIFF', 'DIST_NORM', 'GEO', 'PROF', 'TSENT', 'TWEET_SHARE',
            'GROSE', 'GDATE', 'ODATE']
week_model = LogisticRegression(penalty="l1")
clf, Xtrain, ytrain, Xtest, ytest = do_classify(week_model, CC, week3_data, pred_col,
                                                'SUCCESS', 1, mask=mask, n_folds=3)

#Fraction of class-1 rows in each split, then the test-set predictions
print(sum(ytest) / float(len(ytest)))
print(sum(ytrain) / float(len(ytrain)))
print(clf.predict(Xtest))

#Bar chart of the fitted coefficients, one bar per predictor
coefs = clf.coef_[0]
bar_pos = range(len(coefs))
plt.bar(bar_pos, coefs, align="center", width=.5)
plt.xticks(bar_pos, pred_col, fontsize=10)
plt.ylabel("Regression Coefficients")
plt.title("Week 3 - Regression Results")
plt.show()

using mask
############# based on standard predict ################
Accuracy on training data: 0.98
Accuracy on test data:     0.84
[[ 3  0]
 [ 4 18]]
########################################################
0.88
0.80701754386
[1 0 0 0 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0]

/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:7: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.

Week 4¶

#NOTE(review): the write-up says seasons 13-14 join the analysis at Week 4,
#but early_episode=True still excludes them here - confirm which is intended
week4_data, week4_success = get_week_data(4, True)

#70/30 train/test split, expressed as a boolean mask (True = training row)
n_rows = week4_data.shape[0]
itrain, itest = train_test_split(xrange(n_rows), train_size=0.7)
mask = np.ones(n_rows, dtype=bool)
mask[itest] = False

#Grid-search C for the L1 logistic classifier and report accuracy
CC = {"C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
pred_col = ['AGE_DIFF', 'DIST_NORM', 'GEO', 'PROF', 'TSENT', 'TWEET_SHARE',
            'GROSE', 'GDATE', 'ODATE']
week_model = LogisticRegression(penalty="l1")
clf, Xtrain, ytrain, Xtest, ytest = do_classify(week_model, CC, week4_data, pred_col,
                                                'SUCCESS', 1, mask=mask, n_folds=3)

#Fraction of class-1 rows in each split, then the test-set predictions
print(sum(ytest) / float(len(ytest)))
print(sum(ytrain) / float(len(ytrain)))
print(clf.predict(Xtest))

#Bar chart of the fitted coefficients, one bar per predictor
coefs = clf.coef_[0]
bar_pos = range(len(coefs))
plt.bar(bar_pos, coefs, align="center", width=.5)
plt.xticks(bar_pos, pred_col, fontsize=10)
plt.ylabel("Regression Coefficients")
plt.title("Week 4 - Regression Results")
plt.show()

using mask
############# based on standard predict ################
Accuracy on training data: 0.96
Accuracy on test data:     0.95
[[ 4  1]
 [ 0 16]]
########################################################
0.761904761905
0.829787234043
[1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 0]

/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:7: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.

Week 5¶

week5_data, week5_success = get_week_data(5, False)

#70/30 train/test split, expressed as a boolean mask (True = training row)
n_rows = week5_data.shape[0]
itrain, itest = train_test_split(xrange(n_rows), train_size=0.7)
mask = np.ones(n_rows, dtype=bool)
mask[itest] = False

#Grid-search C for the L1 logistic classifier and report accuracy
CC = {"C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
pred_col = ['AGE_DIFF', 'DIST_NORM', 'GEO', 'PROF', 'TSENT', 'TWEET_SHARE',
            'GROSE', 'GDATE', 'ODATE']
week_model = LogisticRegression(penalty="l1")
clf, Xtrain, ytrain, Xtest, ytest = do_classify(week_model, CC, week5_data, pred_col,
                                                'SUCCESS', 1, mask=mask, n_folds=3)

#Fraction of class-1 rows in each split, then the test-set predictions
print(sum(ytest) / float(len(ytest)))
print(sum(ytrain) / float(len(ytrain)))
print(clf.predict(Xtest))

#Bar chart of the fitted coefficients, one bar per predictor
coefs = clf.coef_[0]
bar_pos = range(len(coefs))
plt.bar(bar_pos, coefs, align="center", width=.5)
plt.xticks(bar_pos, pred_col, fontsize=10)
plt.ylabel("Regression Coefficients")
plt.title("Week 5 - Regression Results")
plt.show()

using mask
############# based on standard predict ################
Accuracy on training data: 0.87
Accuracy on test data:     0.88
[[ 3  3]
 [ 0 18]]
########################################################
0.75
0.781818181818
[1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1]

/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:7: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.

Week 6¶

week6_data, week6_success = get_week_data(6, False)

#70/30 train/test split, expressed as a boolean mask (True = training row)
n_rows = week6_data.shape[0]
itrain, itest = train_test_split(xrange(n_rows), train_size=0.7)
mask = np.ones(n_rows, dtype=bool)
mask[itest] = False

#Grid-search C for the L1 logistic classifier and report accuracy
CC = {"C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
pred_col = ['AGE_DIFF', 'DIST_NORM', 'GEO', 'PROF', 'TSENT', 'TWEET_SHARE',
            'GROSE', 'GDATE', 'ODATE']
week_model = LogisticRegression(penalty="l1")
clf, Xtrain, ytrain, Xtest, ytest = do_classify(week_model, CC, week6_data, pred_col,
                                                'SUCCESS', 1, mask=mask, n_folds=3)

#Fraction of class-1 rows in each split, then the test-set predictions
print(sum(ytest) / float(len(ytest)))
print(sum(ytrain) / float(len(ytrain)))
print(clf.predict(Xtest))

#Bar chart of the fitted coefficients, one bar per predictor
coefs = clf.coef_[0]
bar_pos = range(len(coefs))
plt.bar(bar_pos, coefs, align="center", width=.5)
plt.xticks(bar_pos, pred_col, fontsize=10)
plt.ylabel("Regression Coefficients")
plt.title("Week 6 - Regression Results")
plt.show()

using mask
############# based on standard predict ################
Accuracy on training data: 0.88
Accuracy on test data:     0.95
[[ 7  0]
 [ 1 11]]
########################################################
0.631578947368
0.666666666667
[1 0 0 1 0 0 1 0 1 1 1 0 0 1 1 1 0 1 1]

/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:7: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.

week7_data, week7_success = get_week_data(7, False)

#70/30 train/test split, expressed as a boolean mask (True = training row)
n_rows = week7_data.shape[0]
itrain, itest = train_test_split(xrange(n_rows), train_size=0.7)
mask = np.ones(n_rows, dtype=bool)
mask[itest] = False

#Grid-search C for the L1 logistic classifier and report accuracy
CC = {"C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
pred_col = ['AGE_DIFF', 'DIST_NORM', 'GEO', 'PROF', 'TSENT', 'TWEET_SHARE',
            'GROSE', 'GDATE', 'ODATE']
week_model = LogisticRegression(penalty="l1")
clf, Xtrain, ytrain, Xtest, ytest = do_classify(week_model, CC, week7_data, pred_col,
                                                'SUCCESS', 1, mask=mask, n_folds=3)

#Fraction of class-1 rows in each split, then the test-set predictions
print(sum(ytest) / float(len(ytest)))
print(sum(ytrain) / float(len(ytrain)))
print(clf.predict(Xtest))

#Bar chart of the fitted coefficients, one bar per predictor
coefs = clf.coef_[0]
bar_pos = range(len(coefs))
plt.bar(bar_pos, coefs, align="center", width=.5)
plt.xticks(bar_pos, pred_col, fontsize=10)
plt.ylabel("Regression Coefficients")
plt.title("Week 7 - Regression Results")
plt.show()

using mask
############# based on standard predict ################
Accuracy on training data: 0.89
Accuracy on test data:     0.75
[[3 2]
 [1 6]]
########################################################
0.583333333333
0.75
[1 0 0 1 1 1 1 1 0 1 1 0]

/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:7: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.

Analysis¶

We see from the regression coefficients that a contestant's share of tweets from the previous episode has a great impact on their ability to survive elimination the following week.

We only go up to Week 7 for two reasons: 1) The data becomes very sparse, as only a few contestants per season survive past Week 7. 2) The format of the show changes after Week 7: all contestants get one-on-one dates, and all have a lot of screen time. We would need other indicators for a new type of prediction.