Here we create a predictor defined as the geographic (as-the-crow-flies) distance between the Bachelor's hometown and each contestant's hometown.
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
import json
from geopy import geocoders
import math
# Load the records produced by the Wikipedia scrape from
# tempdata/listAllDicts.json; they carry the hometown information used below.
with open("tempdata/listAllDicts.json") as json_file:
    seasons = json.load(json_file)
Here we get the hometown information for each contestant.
def get_hometowns(choose_season):
    """Return the Bachelor's hometown and the contestants' hometowns for a season.

    Scans the module-level ``seasons`` list for records whose "season" equals
    ``choose_season``. The record whose "elimination" field is "bachelor"
    supplies the Bachelor's hometown; every other matching record is treated
    as a contestant.

    Known errors in the scraped data are corrected on the way through. The
    correction is written back into the record itself, so ``seasons`` is
    modified in place.

    Returns:
        (bachtown, town_dict): the Bachelor's hometown string and a dict
        mapping contestant name -> hometown string.

    Raises:
        UnboundLocalError: if the season has no "bachelor" record, because
        the Bachelor's hometown is then never assigned.
    """
    town_dict = {}
    for record in seasons:
        # Skip everybody who was not part of the requested season.
        if record["season"] != choose_season:
            continue

        if record["elimination"] == "bachelor":
            bachtown = record["hometown"]
            continue

        # Known error in the scraped data: fix this contestant's hometown.
        if record["name"] == "Kacie Boguskie":
            record["hometown"] = "Clarksville, Tennessee"
        town_dict[record["name"]] = record["hometown"]

    return bachtown, town_dict
We use a function that computes the great-circle distance between two points on a sphere, based on the following equation:
$$ \cos(\text{arc length}) = \cos{\phi_1}\cos{\phi_2} + \sin{\phi_1}\sin{\phi_2}\cos{(\theta_1 - \theta_2)} $$
where $\phi$ is the colatitude ($90^\circ$ minus the latitude) and $\theta$ is the longitude of each point. We then multiply the arc length by the radius of the Earth to get the distance.
#Borrows heavily from http://www.johndcook.com/blog/python_longitude_latitude/
def distance_on_unit_sphere(lat1, long1, lat2, long2):
    """Return the great-circle distance in miles between two lat/lon points.

    Despite the name, the arc length on the unit sphere is scaled by the
    Earth's radius (3959 miles) before it is returned.

    Args:
        lat1, long1: latitude and longitude of the first point, in degrees.
        lat2, long2: latitude and longitude of the second point, in degrees.

    Returns:
        The distance between the two points in miles, as a float.
    """
    # Convert latitude and longitude to spherical coordinates in radians.
    degrees_to_radians = math.pi / 180.0

    # phi = 90 - latitude (colatitude)
    phi1 = (90.0 - lat1) * degrees_to_radians
    phi2 = (90.0 - lat2) * degrees_to_radians

    # theta = longitude
    theta1 = long1 * degrees_to_radians
    theta2 = long2 * degrees_to_radians

    # For two locations (1, theta, phi) and (1, theta', phi') on the unit sphere:
    #   cos(arc length) = sin phi sin phi' cos(theta - theta') + cos phi cos phi'
    cos_arc = (math.sin(phi1) * math.sin(phi2) * math.cos(theta1 - theta2) +
               math.cos(phi1) * math.cos(phi2))

    # Floating-point rounding can leave the value a hair outside [-1, 1]
    # (e.g. for identical or antipodal points), which would make math.acos
    # raise ValueError. Clamp it back into the valid domain first.
    cos_arc = max(-1.0, min(1.0, cos_arc))
    arc = math.acos(cos_arc)

    # distance = rho * arc length
    Rearth = 3959  # in miles
    return arc * Rearth
To convert hometowns in (city, state) format to coordinates, we use the Python package "geopy", which accepts input in our format and returns the lat/lon coordinates of the center of town.
def _geocode_lat_lon(geocoder, place):
    """Return (lat, lon) as floats for ``place`` using the given geopy geocoder."""
    # Same generous timeout that make_maps uses for its lookups.
    match = geocoder.geocode(place, timeout=120)
    if match is None:
        # geopy returns None when nothing matches; fail with the offending
        # place name instead of an AttributeError on None further down.
        raise ValueError("Could not geocode location: %r" % (place,))
    return float(match.raw["lat"]), float(match.raw["lon"])


def get_contest_distance(hometown_output):
    """Compute each contestant's distance from the Bachelor's hometown.

    Args:
        hometown_output: the (bachelor_hometown, {contestant: hometown}) pair
            returned by get_hometowns().

    Returns:
        (distance_dict, grid_dict): contestant -> distance in miles from the
        Bachelor's hometown, and contestant -> (lat, lon) of the contestant's
        hometown.

    Raises:
        ValueError: if a hometown cannot be geocoded.
    """
    bach_home, c_home = hometown_output

    #Use geopy package to get lat/lon from location
    gn = geocoders.Nominatim()

    #Get the Bachelor location
    blat, blon = _geocode_lat_lon(gn, bach_home)

    #Find distance between the Bachelor's and each contestant's hometown
    distance_dict = {}
    grid_dict = {}
    for contestant, town in c_home.items():
        clat, clon = _geocode_lat_lon(gn, town)
        distance_dict[contestant] = distance_on_unit_sphere(blat, blon, clat, clon)
        #Save lat/lon
        grid_dict[contestant] = (clat, clon)
    return distance_dict, grid_dict
We run the distance function over all seasons and save the output as a dictionary.
season_nums = range(13, 20)

# Run over all seasons once and keep both outputs of get_contest_distance:
# the distances (the predictor) and the lat/lon pairs (for visualization).
# Calling it a second time just to pick the other element would geocode
# every hometown again.
distance_dict = {}
grid_dict = {}
for season_num in season_nums:
    season_distances, season_grid = get_contest_distance(get_hometowns(season_num))
    distance_dict[season_num] = season_distances
    grid_dict[season_num] = season_grid
    # Same text as the Python 2 statement `print "season ", season_num, " done"`,
    # written so it also runs under Python 3.
    print("season  %s  done" % season_num)

# Save the lat/lon of every contestant's hometown, keyed by season.
with open('grid_dict.json', 'w') as fp:
    json.dump(grid_dict, fp)

# The distances are kept in memory only; uncomment to write them out as well.
#with open('distances.json', 'w') as fp:
#    json.dump(distance_dict, fp)
Here we visualize the distance between the winning contestant's hometown and the Bachelor's hometown for each Season using the Basemap library.
from mpl_toolkits.basemap import Basemap
def make_maps(hometown_output, subplotNum, winnerTownNum, seasonNum):
    """Map one season's hometowns on a new subplot of the module-level ``fig``.

    Plots the Bachelor's hometown (blue circle), every contestant's hometown
    (light blue dot) and the hometown stored at ``winnertowns[winnerTownNum]``
    (red star), then draws a red great-circle line from the Bachelor's
    hometown to that last location. Processes one season at a time.

    NOTE: Only the continental US is mapped (latitude 24 to 50, longitude
    -127 to -65), because there are no contestants/bachelors from HI,
    Alaska, etc.

    Args:
        hometown_output: get_hometowns() output (Bachelor hometown and dict
            of contestant hometowns).
        subplotNum: position on the grid of plots, passed to fig.add_subplot.
        winnerTownNum: index into the module-level ``winnertowns`` list.
        seasonNum: season number, shown in the subplot title.
    """
    bachelor_home, contestant_home = hometown_output

    # New subplot showing the USA in a Mercator projection.
    ax = fig.add_subplot(subplotNum)
    m = Basemap(projection='merc', llcrnrlat=24, urcrnrlat=50,
                llcrnrlon=-127, urcrnrlon=-65, lat_ts=20, resolution='c')
    m.drawmapboundary(fill_color='none')
    m.drawcoastlines()
    m.drawcountries()

    # geopy turns a place name into lat/lon.
    gn = geocoders.Nominatim()

    def lat_lon(place):
        """Geocode ``place`` and return its (lat, lon) as floats."""
        found = gn.geocode(place, timeout=120)
        return float(found.raw["lat"]), float(found.raw["lon"])

    # Bachelor: blue circle.
    bachLat, bachLon = lat_lon(bachelor_home)
    bach_x, bach_y = m(bachLon, bachLat)
    m.plot(bach_x, bach_y, color='blue', marker='o')

    # Contestants: one light blue dot each.
    for town in contestant_home.values():
        contLat, contLon = lat_lon(town)
        cont_x, cont_y = m(contLon, contLat)
        m.plot(cont_x, cont_y, color='lightskyblue', marker='.')

    # Winner: red star, joined to the Bachelor's hometown by a red line.
    winnerLat, winnerLon = lat_lon(winnertowns[winnerTownNum])
    win_x, win_y = m(winnerLon, winnerLat)
    m.plot(win_x, win_y, color='red', marker='*')
    m.drawgreatcircle(bachLon, bachLat, winnerLon, winnerLat, linewidth=2, color='red')

    ax.set_title('Season ' + repr(seasonNum))
import matplotlib.patches as mpatches
# Hometowns handed to make_maps as the "winner" locations, in the order the
# records appear in `seasons`.
# NOTE(review): this keeps the records whose "elimination" field is
# "bachelor", the same test get_hometowns uses to find the Bachelor, so the
# list looks like the Bachelors' hometowns rather than the winners'.
# Confirm which value marks the winner and adjust the filter if so.
# NOTE(review): indexing below assumes these records line up one-to-one with
# seasons 13-19 in order -- verify against the data.
winnertowns = [sdict["hometown"] for sdict in seasons
               if sdict["elimination"] == "bachelor"]

# One figure holds the whole grid of season maps.
fig = plt.figure(figsize=(12, 11))
subplotNum = 331     # 3 rows, 3 columns, start at the first cell
winnerTownNum = 0    # index into winnertowns for the current season

# Draw one map per season; the grid position and the winnertowns index
# advance together.
season_nums = range(13, 20)
for seasonNum in season_nums:
    make_maps(get_hometowns(seasonNum), subplotNum, winnerTownNum, seasonNum)
    subplotNum += 1
    winnerTownNum += 1

# Title for the whole grid.
fig.suptitle("Bachelors' and Contestants' Hometowns", fontsize=28 )

# Legend built from colour patches matching the markers used in make_maps.
blue_patch = mpatches.Patch(color='blue', label='Bachelor')
lightblue_patch = mpatches.Patch(color='lightskyblue', label='Contestants')
red_patch = mpatches.Patch(color='red', label='Winner')
legend_handles = (blue_patch, lightblue_patch, red_patch)
legend_labels = ('Bachelors', 'Contestants', 'Winners')
fig.legend(legend_handles, legend_labels, 'best')

# Tight layout; 'rect' leaves room so the suptitle sits above the grid.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
We can see that there doesn't seem to be a consistent pattern across all seasons - i.e., each Bachelor doesn't always choose contestants who come from similar regions.