Here we scrape Wikipedia pages that contain information about each season of The Bachelor. We are interested in the following fundamental data types: age, profession, and hometown. Additionally, for each week of the competition, we scrape to find out if a contestant received a group date, a group rose, or a one-on-one date.
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
from competition_data import *
Here we get the raw HTML from each page
#get wiki for all bachelor seasons
allseasons = requests.get("https://en.wikipedia.org/wiki/The_Bachelor_(U.S._TV_series)#Seasons")
soup = BeautifulSoup(allseasons.text, "html.parser") #make soup element
#get the table cell that has links to each episode
seasons = soup.find("table", attrs={"class":"navbox"}).find("td", attrs={"class":"navbox-list navbox-odd hlist"})
seasons = seasons.find("div", attrs={"style":"padding:0em 0.25em"}).find("ul")
urls = [] #list of links to season-specific page
seasonNums = [] #list of seasons w/ wiki pages (no seasons 1-4 or 6-8)
seasonNum = 1 #season number
for item in seasons.find_all("li"): #for each item in list of seasons
if (seasonNum == 20): #don't include season 20, b/c no contestants listed yet
break
season = item.find("a") #get url tag
if season is not None: #if has url link, get url text
urls.append("\"https://en.wikipedia.org" + season.get("href") + "\"")
seasonNums.append(seasonNum) #add season number to list
seasonNum += 1
wikiPageText = [] #init list of wiki site text, for all seasons
for url in urls:
site = requests.get(url[1:-1]) #get web-site for that url
soup = BeautifulSoup(site.text, "html.parser") #make BS element
wikiPageText.append(soup) #add web-site text to list
wikiPages = dict(zip(seasonNums, wikiPageText)) #key=season, val=Soup Elem(wiki page text)
Now we parse through the HTML tags to pull out the fundamental data fields we are looking for (name, age, hometown, occupation, and elimination week).
# For each season in wiki, make list of dictionaries - one dictionary for each contestant.
# Also, make a list of dictionaries of all contestants for all seasons.
#
# list name = listAllDicts #a list of all dicts for all contestants and bachelors, all seasons
#
# dictionary name = seasonsDict
# key = season number
# value = list of dictionaries for that season (one for each contestant)
#
# dictionary name = contestantDict
# keys = name, age, hometown, occupation, elimination, season
# values = associated values to fields, as scraped from wiki
#
# To test contestant dictionaries:
# print seasonsDict[season][contestant][fieldname]
# eg: print seasonsDict[9][10]['name'] -- get name for season 9, contestant 10
#
# Note: Wiki does not have pages dedicated to Seasons 1-4, or 6-8. I added 2, 4, 6, and 8
# below, from other sources. Contestants for episode 20 are not added, because they are
# not public yet.
#
import sys
reload(sys)
sys.setdefaultencoding('utf8')
seasonsDict = dict() #key = season num, val=list of contestant dictionaries
allContestants = dict() #keys = name/age/etc, values = associated data
listAllDicts = [] #list of dicts for all cont. and bach. for ALL seasons
for sn in seasonNums:
seasonPage = wikiPages[sn] #get BS element for this season
seasonPage = seasonPage.find("div", attrs={"id":"content"}).find("div", attrs={"id":"bodyContent"})
seasonPage = seasonPage.find("div", attrs={"id":"mw-content-text"})
seasonPage = seasonPage.find("table", attrs={"class":"wikitable sortable"})
listOfContestantDicts = [] #list of dicts for each contestant
numtr = 0 #num rows (one per contestant)
for tr in seasonPage.find_all("tr"):#for each contestant listed,
if (numtr == 0): #skip first row (column headers)
numtr += 1
continue
contestantDict = dict() #init new dict for contestant
numtd = 0 #column number
for td in tr.find_all("td"): #for each column of data,
#NAME
if (numtd == 0):
name = str(td.contents)
if ("<b>" in name):
td.find("b")
name = str(td.contents)[4:-5]
if (("[u'" in name) or ("[u\"" in name)): #if "[u'name']",
name = name.encode('utf8')[3:-2] #format to get 'name'
if ("<span class" in name):
td.find("span", attrs={"class":"nowrap"})
tag = "<span class='nowrap'>" #start tag before name
name = str(td.contents)[len(tag)+1:]#cut out start tag
end = name.index("<") #get start point of end tag
name = name[:end] #cut out end tag
trashTag = "style=\"display:none;\">" #weird tag to cut from a name
if (trashTag in name):
name = name[(len(trashTag)+1):-1]
if ("<sup" in name): #if name has "name', <sup ...",
end2 = name.index("<sup") #format to get name
name = name[:end2-3]
if ("</b" in name):
name = name[:name.index("</b")]
if ("href" in name): #if name has url
name = td.find("a").get("href")
name = td.get_text("title")
if ("title" in name): #if 'title' in name, take it out
name = name[:name.index("title")] #eg "Keltie Colleentitle[title20title]"
contestantDict['name'] = name #add name to dict
#AGE
if (numtd == 1):
age = str(td.contents)
if ("<b>" in age):
td.find("b")
age = str(td.contents)[4:-5]
if ("[u'" in age):
age = age.encode('utf8')[3:5]
if (age is None): #if no age (eg season 9, Cosetta Blanca)
age = "na"
contestantDict['age'] = age
#HOME
if (numtd == 2):
home = ""
for url in td.find_all("a"): #for each url to a place,
url.get("href")
home2 = url.get_text("title") #get place name
if (len(home) > 0): #if already have city,
home = home + ", " + home2 #concatenate state
else: #if no city yet (or home is one word),
home = home2 #save city name or home name
if ("title" in home): #format oddity in season 19, contest 1
indx = home.index("title")
home = home[:indx]
if ("[" in home): #format oddity - homes end in ", ["
indx2 = home.index("[")
home = home[:indx2-2]
contestantDict['hometown'] = home
#OCCUPATION
if (numtd == 3):
job = str(td.contents)
if ("<b>" in job):
td.find("b")
job = str(td.contents)[4:-5]
if (("[u'" in job) or ("[u\"" in job)):
job = job.encode('utf8')[3:-2]
if ("href" in job): #if occupation has url
job = td.find("a").get("href")
job = td.get_text("title")
if ("nowrap" in job):
job = td.get_text("span") #, attr={"class":"nowrap"})
if ("title" in job):
titleindex = job.index("title")
job = job[:titleindex] + " " + job[(titleindex+len("title")):]
if ("title" in job): #sometimes, 'title' appears twice in 'occupation'
titleindex = job.index("title")
job = job[:titleindex] + " " + job[(titleindex+len("title")):]
if ("below" in job):
job = "unknown"
contestantDict['occupation'] = job
#ELIMINATION
if (numtd == 4):
elim = str(td.contents)
if ("<b>" in elim):
td.find("b")
elim = str(td.contents)[4:-5]
if ("[u'" in elim):
elim = elim.encode('utf8')[3:-2]
if("Eliminated in " in elim):
elim = elim[len("Eliminated in "):]
if(("Quit in " in elim) or ("quit in " in elim)):
elim = elim[len("Quit in "):]
if(("Week " in elim) or ("week " in elim)): #remove "week", leave week number only
elim = elim[len("Week "):]
if (("Returned" in elim) or ("returned" in elim)):
elim = elim[:elim.index("', <br/>")]
contestantDict['elimination'] = elim
numtd += 1
numtr += 1
contestantDict['season'] = sn #include season num in dict
listOfContestantDicts.append(contestantDict) #add dict to list of dicts in this season
listAllDicts.append(contestantDict) #add dict to list of all dicts in all seasons
seasonsDict[sn] = listOfContestantDicts #key = season num, val=list of contestant dicts
Now we get the "competitive" data - i.e. whether a contestant received a rose in a given week.
from types import *
from collections import Counter
import operator
import re
def getCompetitionDetails(wikiPages, seasons):
competitionDetails = dict()
newerSeasons = [18,19] # these season have a different page layout and must be handled differently
for sn in seasons:
weeklyDetails = []
seasonPage = wikiPages[sn] #get BS element for this season
seasonPage = seasonPage.find("div", attrs={"id":"content"}).find("div", attrs={"id":"bodyContent"})
seasonPage = seasonPage.find("div", attrs={"id":"mw-content-text"})
bodyElements = seasonPage.find_all(['h2','h3','p','table']) #put body elements into a list so they can be parsed through
# iterater over the list of body elements to find the relevent data
for i, element in enumerate(bodyElements):
if sn in newerSeasons:
# find the episodes section
if element.name == 'h2' and not type(element.span) == NoneType:
if element.span.text == "Episodes":
# capture each table row into the episodesDetail list excluding
# the first tr, which is the table header.
# the rows are arranged in pairs, with the first the row containing
# the episode num, title and air date and the second row containing
# the episode summary
episodesDetails = bodyElements[i+1].find_all('tr')
for episodeIndex, episodeDetails in enumerate(episodesDetails):
if not episodeIndex % 2 == 0:
episodeNumber = (episodeIndex+1) / 2
if episodeIndex+1 < len(episodesDetails):
episodeSummary = episodesDetails[episodeIndex+1].text
episodeTitle = episodeDetails.find('td', {'class':'summary'}).text
if "Week" in episodeTitle:
weeklyDetails.append(episodeSummary)
# get the tab
else:
text = element.find(text=True)
if "Week" in str(text):
parsingSection = True
nextElementIndex = i+1
# parse through the next elements until we reach
# elements pertaining to the next week
while parsingSection:
if not (nextElementIndex < len(bodyElements)):
break
nextElement = bodyElements[nextElementIndex]
# if the next element is an h3 element then it is the header of the
# next we and we should stop here
if nextElement.name == 'h2' or nextElement.name == 'h3':
parsingSection = False
weeklyDetails.append(bodyElements[i+1:nextElementIndex])
# otherwise we go to the next element
else:
nextElementIndex += 1
competitionDetails[sn] = weeklyDetails
return competitionDetails
def getContestantCompData(competitionDetails, allContestants):
contestantCompData = {}
newerSeasons = [18, 19]
for sn in list(allContestants.keys()):
contestantsData = {}
# for each episode determine who had a group or one on one date and whether they got a rose
for weekIndex, weekDetails in enumerate(competitionDetails[sn]):
if sn in newerSeasons:
episodeSummary = weekDetails
hasIndividualDates = 'one-on-one' in episodeSummary.lower() or 'two-on-one' in episodeSummary.lower()
hasGroupDates = 'group date:' in episodeSummary.lower()
if hasIndividualDates or hasGroupDates:
for marker in ['one-on-one', 'group date:', 'two-on-one']:
dateType = marker.replace(' date:','')
if not marker in episodeSummary.lower():
continue
# find all indexes of the markers specified
markerIndexes = [m.end() for m in re.finditer(marker, episodeSummary.lower())]
for markerIndex in markerIndexes:
endOfContestants = markerIndex + episodeSummary[markerIndex:].index('. ')
contestants = episodeSummary[markerIndex:endOfContestants]
endOfSection = episodeSummary[markerIndex:].index('\n') + markerIndex
section = episodeSummary[markerIndex:endOfSection]
generateRoseData(allContestants[sn], contestantsData, weekIndex+1, dateType, contestants, section)
else:
for detailsIndex, details in enumerate(weekDetails):
if not(type(details.b) == NoneType):
dateType = details.b.text.lower()
if "one-on-one" in dateType or "two-on-one" in dateType or "group" in dateType:
start = details.text.index(':') +2
end = details.text.index('. ')
contestants = details.text[start:end]
generateRoseData(allContestants[sn], contestantsData, weekIndex+1, dateType, contestants, details.text)
contestantCompData[sn] = contestantsData
return contestantCompData
def generateRoseData(allContestants, contestantsData, week, dateType, contestantsOnDate, section):
# find the contestants that are on the date
contestants = []
for contestant in allContestants:
contestantFirstName = contestant['name'].split(' ')[0]
if contestantFirstName in contestantsOnDate:
contestants.append(contestantFirstName)
# account for any contestants with same name
contestants = dict(Counter(contestants))
contestantsToRemove = []
contestantsToAdd = []
for name in contestants:
if contestants[name] > 1:
contestantsToRemove.append(name)
for contestant in allContestants:
if name in contestant['name']:
fullName = contestant['name'].split(' ')
if len(fullName) == 3:
firstName, middleName, lastName = fullName
if '(' in lastName and ')' in lastName:
ref = lastName
lastName = middleName
searchableName = firstName+' '+ref
else:
firstName, lastName = fullName
searchableName = firstName+' '+lastName[0]
#searchableName2 = firstName+lastName[0]
if searchableName in contestantsOnDate:
contestantsToAdd.append(searchableName)
for name in contestantsToRemove:
del contestants[name]
for name in contestantsToAdd:
contestants[name] = 1
contestants = contestants.keys()
# determine who got a rose
if 'rose' in section:
roseIndex = section.index('rose')
endOfSentence = section[roseIndex:].index('.') + roseIndex
startOfSentence = endOfSentence - section[:endOfSentence][::-1].index('.') + 1
roseSentence = section[startOfSentence:endOfSentence]
for contestant in contestants:
contestantFirstName = contestant.split(' ')[0]
receivedRose = False
for got in ['got', 'receiv', 'gets', 'presents', 'gives','holds','has','giving', 'gave', 'extend']:
if got in roseSentence and not (' not ' in roseSentence):
if contestantFirstName in roseSentence:
receivedRose = True
break
elif 'one' in dateType: # we assume the pronoun refers to contestant on the date
receivedRose = True
break
elif 'group' in dateType: # we assume the pronoun refers to the last mentioned contestant
contestantIndex = {}
for c in contestants:
contestantIndex[c.split(' ')[0]] = section[:endOfSentence][::-1].index(c.split(' ')[0][::-1])
closestContestant = min(contestantIndex.iteritems(), key=operator.itemgetter(1))[0]
if closestContestant in contestant:
receivedRose = True
break
compData = (week, dateType, receivedRose)
if contestant in contestantsData:
contestantsData[contestant].append(compData)
else:
contestantsData[contestant] = [compData]
def getWeeklyCompData(contestantCompData):
result = {}
for sn in contestantCompData:
weeklyCompData = {}
for contestant in contestantCompData[sn]:
dates = contestantCompData[sn][contestant]
for week,dateType,receivedRose in dates:
if not week in weeklyCompData:
weeklyCompData[week] = {}
if not contestant in weeklyCompData[week]:
weeklyCompData[week][contestant] = {}
if not dateType in weeklyCompData[week][contestant]:
weeklyCompData[week][contestant][dateType] = {}
weeklyCompData[week][contestant][dateType] = receivedRose
result[sn] = weeklyCompData
return result
def addCompetitionData(seasonsDict, contestantCompData):
for sn in seasonsDict:
for contestant in seasonsDict[sn]:
contestant['group_dates'] = 0
contestant['individual_dates'] = 0
contestant['roses_from_group_dates'] = 0
contestant['roses_from_individual_dates'] = 0
for contestantFirstName in contestantCompData[sn]:
contestantName = "None"
if contestantFirstName == "" or " " in contestantFirstName or len(contestantFirstName) < 3:
continue
if contestantFirstName[-1].isupper():
contestantName = contestantFirstName[:-1] + ' ' + contestantFirstName[-1]
if contestantFirstName in contestant['name'] or contestantName in contestant['name']:
dates = contestantCompData[sn][contestantFirstName]
numGroupDates = 0
numIndividualDates = 0
numRoseOnGroupDates = 0
numRoseOnIndividualDates = 0
for week, dateType, receivedRose in dates:
if 'group' in dateType.lower():
numGroupDates += 1
if receivedRose:
numRoseOnGroupDates += 1
if 'one' in dateType.lower():
numIndividualDates += 1
if receivedRose:
numRoseOnIndividualDates += 1
contestant['group_dates'] = numGroupDates
contestant['individual_dates'] = numIndividualDates
contestant['roses_from_group_dates'] = numRoseOnGroupDates
contestant['roses_from_individual_dates'] = numRoseOnIndividualDates
break
# Get the details of each competition for each week
# compettionDetails is a dictionary: key = season number, val = list of weekly summary
competitionDetails = getCompetitionDetails(wikiPages, seasonsDict)
# Get the detailts of each contestants performances for each season
# contestantCompData is a dictionary: key = season number, val = dict with contestant
# name as keys (values are tuples = (week num [int], date type [str], received rose [bool]))
contestantCompData = getContestantCompData(competitionDetails, seasonsDict)
# Get each contestants performance for each week of each season
# weeklyCompData is a dict: key = season number, val = dict with week number as keys
# (values are dict: key = contestant name, val = dict: key=date type val=reeived rose (bool))
weeklyCompData = getWeeklyCompData(contestantCompData)
# Add the contestantCompData to contestant dictionaries in seasonsDict
addCompetitionData(seasonsDict, contestantCompData)
We now add the fundamental data for each Bachelor - i.e. where is he from, what is his job, how old is he.
# Add the men (the Bachelors) for all seasons.
#
# FIRST: get all data and add it to a dictionary, one for each bachelor:
# dictionary name = bachelorDict
# keys = name, age, hometown, occupation, elimination=bachelor, season
# values = associated values to fields, as scraped from wiki
#
# For example, here is the dictionary for the first bachelor (Season 1):
# {'name': 'Alex Michel', 'hometown': 'Charlottesville, Virginia',
# 'age': 32, 'season': '1', 'elimination': 'bachelor',
# 'occupation': 'Management consultant'}
#
# SECOND: add this data to the list that has data for all contestans and bachelors:
# listAllDicts - a list of all dicts for all cont and bachelors, all seasons
#
# Bachelors can be identified easily in this list because their 'elimination' column value
# is 'bachelor' (whereas contestants have 'elimination' column values 'runner-up', or
# '7' for week seven).
#
#go to wiki homepage for The Bachelor, make soup element
allseasons = requests.get("https://en.wikipedia.org/wiki/The_Bachelor_(U.S._TV_series)#Seasons")
soup = BeautifulSoup(allseasons.text, "html.parser")
#get the table cell that has links to each episode
men = soup.find("table", attrs={"class":"wikitable plainrowheaders"})
men = men.find_all("tr")
numRow = 0
for man in men: #for each bachelor in the table,
bachelorDict = dict() #init new dict for this bachelor
bachAge = "unknown" #default values for those we can't find
bachHometown = "unknown"
bachOccupation = "unknown"
if (numRow == 0): #skip first row - col titles
numRow += 1
continue
if (numRow > 19): #don't collect data on Season 20 Bachelor
break
numCol = 0
for col in man.find_all("td"): #for each col in this bachelor row,
#SEASON NUMBER
if (numCol == 0):
seasonNum = col.get_text()
seasonNum = seasonNum.encode('utf8')
if (numCol == 1): #get season year (to calculate age later)
col = col.get_text()
if ("[" in col):
seasonYear = col.encode('utf8')[-8:-4]
else:
seasonYear = col.encode('utf8')[-4:]
#Get URL for bachelor's personal site, make soup element
if (numCol == 2):
manURL = col.find("a").get("href")
manPage = requests.get("https://en.wikipedia.org" + manURL)
bachSoup = BeautifulSoup(manPage.text, "html.parser")
manSoup = bachSoup.find("table", attrs={"class":"infobox biography vcard"})
if (manSoup is None):
manSoup = bachSoup.find("table", attrs={"class":"infobox vcard"})
if (manSoup is not None):
manSoup = manSoup.find_all("tr")
#go to 'biography box' on bachelor's personal site
bioRow = 0
for row in manSoup:
#BACHELOR AGE
bornYear = row.find("span", attrs={"class":"bday"})
if (bornYear is not None):
bornYear = bornYear.get_text()
bornYear = bornYear.encode('utf8')[:4]
bachAge = int(seasonYear) - int(bornYear) #calculate age
#BACHELOR HOMETOWN
bachHome = row.find("span", attrs={"class":"birthplace"})
if (bachHome is None):
bachHome = row.find("td", attrs={"class":"birthplace"})
if (bachHome is not None):
bachHometown = bachHome.find("a") #.get("href")
bachHometown = str(bachHometown.contents[0])
if ("New York City" in bachHometown):
bachHometown = "New York City, New York"
bioRow += 1
#BACHELOR NAME
bachName = col.get_text()
bachName = bachName.encode('utf8')
if ("[" in bachName):
bachName = bachName[:-4]
#BACHELOR OCCUPATION
if (numCol == 3):
bachOccupation = col.get_text()
bachOccupation = bachOccupation.encode('utf8')
numCol += 1
#hard-code data for Bachelors who don't have own wiki page
if ("Grant" in bachName):
bachHometown = "London, UK"
bachAge = "27"
if ("Womack" in bachName):
bachHometown = "Austin, Texas"
bachAge = "37"
if ("Flajnik" in bachName):
bachHometown = "Sonoma, California"
bachAge = "28"
if ("Soules" in bachName):
bachHometown = "Arlington, Iowa"
bachAge = "33"
if ("Palmer" in bachName):
bachHometown = "Toronto, Ontario"
#add info to bachelor dictionary
bachelorDict['name'] = bachName
bachelorDict['age'] = bachAge
bachelorDict['hometown'] = bachHometown
bachelorDict['occupation'] = bachOccupation
bachelorDict['elimination'] = "bachelor"
bachelorDict['season'] = int(seasonNum.encode('utf8'))
#add bachelor dictionary to list of all dicts for all people in all seasons
listAllDicts.append(bachelorDict)
numRow +=1 #get next bachelor from wiki table
#Non-wiki Data sources:
#Grant: http://www.realitytvworld.com/news/matt-grant-dishes-on-upcoming-the-bachelor-london-calling-finale-7066.php
#Womack: http://www.people.com/people/article/0,,20429663,00.html
#Soules:http://www.people.com/article/chris-soules-new-bachelor-in-love
For earlier seasons, Wikipedia does not have the data - so we look to other websites to add in the data.
# Get data for Seasons 2, 4 and 6.
#
# Despite much effort, I could not get the text from the'realitytvword.com' sources below.
# The Beautiful Soup elements did not match the "inspect element" html tags.
# (Oddly, I was able to scrape from 'realitytv.about.com' for season 8 - see below.)
#
# I tried the following suggestions, but they did not work:
# https://www.reddit.com/r/learnpython/comments/2nqhzw/how_come_a_websites_page_source_html_is_different/
# http://stackoverflow.com/questions/26913316/beautiful-soup-doesnt-get-full-webpage
# To keep on schedule, I hand-entered the data. If time permits, I will come back to this.
#
# SEASON 2 SOURCES
# 1) http://www.realitytvworld.com/#$$nxtmgs&&BYitVosLEeWjwgrBiYTF8Q$$
# 2) winner: http://www.realitywanted.com/shows/the-bachelor/season-2
#
# SEASON 4 SOURCES
# 1) http://www.realitytvworld.com/#$$nxtgih&&Dvr38or5EeW1VRL/9wgFGw$$
# 2) http://draheid.com/archives/bachelor4/messages/1452262/1105037.html
# 3) winner: http://www.realitywanted.com/shows/the-bachelor/season-4
#
# SEASON 6 SOURCES
# 1) Source:(looks different in Safari versus Chrome) http://www.realitytvworld.com/#$$nxt6je&&9hhYJIraEeWixQqIPWP/qw$$
# 2) Alt. Source: http://www.realitytvworld.com/news/abc-releases-identities-of-sixth-bachelor-edition-bachelorettes-2880.php")
# 3) FYI, source without occupations, but with pictures:
# (http://community.realitytvworld.com/cgi-sys/cgiwrap/rtvw2/community/dcboard.cgi?az=printer_format&om=894&forum=DCForumID42)
# 4) Source of winner name: http://www.realitywanted.com/shows/the-bachelor/season-6
# 5) Mary Delgado source: http://www.sptimes.com/2003/09/26/Tampabay/No_wedding_bells__jus.shtml
#
#make array of data for contestants in Season 2
season2 = ["Heather, a 23-year-old sales coordinator who currently resides in Walnut Creek, CA",
"Lori, a 26-year-old public relations representative who currently resides in Dallas, TX",
"Heather, a 30-year-old flight attendant who currently resides in Watauga, TX",
"Amber, a 25-year-old therapist who currently resides in Chapel Hill, NC",
"Cari, a 28-year-old elementary school teacher who currently resides in Granite City, IL",
"Christy, a 24-year-old radiologic technologist who currently resides in Avondale, AZ",
"Hayley, a 28-year-old store manager who currently resides in Dana Point, CA",
"Camille, a 29-year-old actress/model who currently resides in Los Angeles, CA",
"Kyla Faye, a 22-year-old recording artist who currently resides in Midvale, UT",
"Erin, a 25-year-old national magazine who currently resides in Chester, PA",
"Frances, a 30-year-old strategic planning analyst who currently resides in San Francisco, CA",
"Dana, a 24-year-old radio sales who currently resides in Beverly Hills, CA",
"Merrilee, a 27-year-old teacher who currently resides in Forked River, NJ",
"Suzi, a 27-year-old communications specialist who currently resides in Richmond, VA",
"Anindita, a 27-year-old attorney who currently resides in New York, NY",
"Fatima, a 22-year-old student who currently resides in Long Beach, CA",
"Helene Eksterowicz, a 27-year-old school psychologist who currently resides in Glouchester, NJ",
"Brooke Nicole, a 22-year-old student who currently resides in Tuscaloosa, AL",
"Liangy, a 30-year-old paralegal who currently resides in Coral Gables, FL",
"Erin, a 23-year-old interior designer who currently resides in Houston, TX",
"Suzanne, a 32-year-old flight attendant who currently resides in Redondo Beach, CA",
"Angela, a 26-year-old registered nurse who currently resides in Kansas City, MO",
"Shannon, a 25-year-old graphic artist who currently resides in Hicksville, NY",
"Christi Diane, a 23-year-old financial advisors asst. who currently resides in Eagle, ID",
"Gwen, a 31-year-old executive recruiter who currently resides in Chester Springs, PA"]
#make array of data for contestants in Season 4
season4= ["Brooke, a 24-year-old Teacher who currently resides in Bartlett, TN",
"Lee-Ann, a 24-year-old Second Grade Teacher who currently resides in Athens, GA",
"Shea, a 25-year-old Firefighter who currently resides in Shreveport, LA",
"Mary, a 35-year-old Sales Manager who currently resides in Tampa, FL",
"Lindsay, a 23-year-old Professional Dancer who currently resides in Los Angeles, CA",
"Estella Gardinier, a 27-year-old Mortgage Broker who currently resides in Beverly Hills, CA",
"Lanah, a 27-year-old Event Coordinator who currently resides in Poolesville, MD",
"Jenny, a 30-year-old Marketing Director who currently resides in Austin, TX",
"Kristi, a 24-year-old Loan Processor who currently resides in Chicago, IL",
"Lindsay, a 25-year-old Pharmaceutical Sales who currently resides in Mauldin, SC",
"Shelly, a 26-year-old Pharmaceutical Sales who currently resides in Wanwatosa, WI",
"Kelly Jo, a 23-year-old Director of Community Relations who currently resides in Kalamazoo, MI",
"Antoinette, a 30-year-old Senior Account Manager who currently resides in Philadelphia, PA",
"Stacey, 26-year-old a Hair Stylist who currently resides in Massillon, OH",
"Heather, a 24-year-old Recent College Graduate who currently resides in Chicago, IL",
"Meredith, a 29-year-old Model/ Makeup Artist who currently resides in West Hollywood, CA",
"Misty, a 23-year-old Radio Promotions Assistant who currently resides in Dallas, TX",
"Christine, a 24-year-old Administrative Assistant who currently resides in Corona, CA",
"Jenn, a 26-year-old Elementary School Teacher who currently resides in La Jolla, CA",
"Leona, a 25-year-old Realtor's Assistant who currently resides in Chicago, IL",
"Samantha, a 25-year-old Kitchen Designer who currently resides in Chicago, IL",
"Julie, a 29-year-old Sales/ Modeling who currently resides in Louisville, KY",
"Karin, a 32-year-old Mortgage Consultant who currently resides in Brooklyn Park, MN",
"Lauren, a 24-year-old Retail Buyer who currently resides in Redondo Beach, CA",
"Darla, a 26-year-old Attorney who currently resides in Gainesville, FL"]
#make array of data for contestants in Season 6
season6 = ["Abby, a 29-year-old acrobat who currently resides in Henderson, NV",
"Alma Rubenstein, a 35-year-old cafe owner who currently resides in Astoria, OR",
"Amanda, a 27-year-old cosmetics buyer who currently resides in New York, NY",
"Amy, a 27-year-old marketing consultant who currently resides in San Diego, CA",
"Andrea, a 33-year-old dental hygienist who currently resides in Denver, CO",
"Ashley, a 31-year-old teacher who currently resides in Santa Barbara, CA",
"Carolyn, a 36-year-old financial advisor who currently resides in Tulsa, OK",
"Cheresse, a 31-year-old advertising director who currently resides in St. Louis, MO",
"Cynthia, a 37-year-old charity foundations director who currently resides in Hermosa Beach, CA",
"Elizabeth, a 28-year-old in pharmaceutical sales who currently resides in Chicago, IL",
"Jayne, a 37-year-old dog groomer, who currently resides in Key Largo, FL",
"Jennifer, a 31-year-old account executive who currently resides in Seattle, WA",
"Kelly, a 34-year-old actress who currently resides in Beverly Hills, CA",
"Kerry, a 31-year-old nurse who currently resides in San Francisco, CA",
"Kristie, a 32-year-old bar owner who currently resides in Windsor, Canada",
"Kristin, a 27-year-old office manager who currently resides in Pensacola, FL",
"Krysta, a 28-year-old financial analyst who currently resides in Oklahoma City, OK",
"Leina, a 28-year-old advertising associate who currently resides in Chula Vista, CA",
"Lisa, a 33-year-old teacher who currently resides in West Palm Beach, FL",
"Melinda, a 39-year-old photographer who currently resides in Nashville, TN",
"Natalie, a 34-year-old in retail sales who currently resides in Santa Monica, CA",
"Nicole, a 28-year-old executive recruiter who currently resides in Libertyville, IL",
"Susie, a 32-year-old insurance broker who currently resides in Hollywood, CA",
"Tanya, a 31-year-old teacher who currently resides in Plano, Texas",
"Wende, a 28-year-old model who currently resides in Austin, Texas",
"Mary Delgado, a 35-year-old real estate agent who currently resides in Tampa Bay, FL"]
# Add contestant data for seasons 2, 4 and 6 to dictionary 'contestantDict'.
#
# param : array of strings with contestant data
# param : season number
# param : winner name
def addNonWikiData(contestantArray, seasonNum, winnerName):
for line in contestantArray:
firstComma = line.index(',') #parse string
startAge = line.index(" a ")
jobTag = "year-old "
startJob = line.index(jobTag)
homeTag = "currently resides in "
startHome = line.index(homeTag)
contestantDict = dict() #init new dict for contestant
contestantDict['name'] = line[:firstComma] #put field data into dictionary
contestantDict['age'] = line[startAge+3:startAge+5]
contestantDict['hometown'] = line[startHome + len(homeTag):]
contestantDict['occupation'] = line[startJob + len(jobTag):line.index("who")-1]
contestantDict['season'] = seasonNum
if (winnerName in line): #if this is the Winner,
contestantDict['elimination'] = "Winner" #add 'winner' to 'elimination' field
else:
contestantDict['elimination'] = "unknown"
listOfContestantDicts.append(contestantDict) #add dict to list of dicts for this season
listAllDicts.append(contestantDict) #add dict to list of all dicts in all seasons
seasonsDict[seasonNum] = listOfContestantDicts #key = season, val=list of contestant dicts
#add contestant data for seasons 4 and 6 to the contestant dictionary
addNonWikiData(season2, 2, "Eksterowicz")
addNonWikiData(season4, 4, "Gardinier")
addNonWikiData(season6, 6, "Delgado")
# Get data for Season 8, add to dictionary
#get site with season 8 contestants, make soup element
seasonEight = requests.get("http://realitytv.about.com/od/thebachelor8/ig/Ladies-of-The-Bachelor--Paris/") #get site
season8= BeautifulSoup(seasonEight.text, "html.parser")
#get the table cell that has links to each episode
eight = season8.find("body", attrs={"id":"imagegalleryIndexPage"})
eight = eight.find("main", attrs={"id":"main"})
eight = eight.find("div", attrs={"class":"container"})
eight = eight.find_all("div", attrs={"class":"row"})[1]
eight = eight.find("div", attrs={"class":"col col-11"}).find("div", attrs={"class":"row"})
eight = eight.find("div", attrs={"class":"col col-8"})
eight = eight.find("div", attrs={"class":"content widget gallery-index-content"})
eight = eight.find("ul")
urls8 = [] #list of urls for season 8 contestant pages
for item in eight.find_all("li", attrs={"itemtype":"http://schema.org/ImageObject"}):#for each contestant in list of season 8 contestants
url8 = item.find("a") #get url tag
if url8 is not None: #if has url link, get url
urls8.append("\"http://realitytv.about.com" + url8.get("href") + "\"")
#add contestant site leftover from next page
urls8.append("\"http://realitytv.about.com/od/thebachelor8/ig/Ladies-of-The-Bachelor--Paris/Shiloh-of-The-Bachelor--Paris.htm\"")
cont8Sites = [] #list of soup objects for season 8 contestant sites
for link in urls8:
site8 = requests.get(link[1:-1])
soup8 = BeautifulSoup(site8.text, "html.parser") #get soup element
cont8Sites.append(soup8) #add soup element to list
for cont8 in cont8Sites: #for each soup element (one per contestant site),
c8 = cont8.find("body", attrs={"id":"imagegalleryPage"}) #find data
c8 = c8.find("main", attrs={"class":"slab"})
c8 = c8.find("div", attrs={"class":"container"})
c8 = c8.find_all("div", attrs={"class":"row"})[1]
c8 = c8.find("div", attrs={"class":"col col-11"})
c8 = c8.find("div", attrs={"id":"contentIntro"})
c8 = c8.find("div", attrs={"class":"row"})
c8 = c8.find("div", attrs={"class":"col col-6"})
c8 = c8.find("div", attrs={"class":"muted subheading"}).getText()
contestantDict = dict() #init new dict for contestant
#get name
firstComma = c8.index(',')
contestantDict['name'] = c8[:firstComma]
#get age
substrC8 = c8[firstComma+2:]
secondComma = substrC8.index(',')
contestantDict['age'] = substrC8[:secondComma]
#get hometown
hometag = "resides in "
if (hometag not in c8):
hometag = "living in "
homeIndex = c8.index(hometag)
contestantDict['hometown'] = c8[(homeIndex+len(hometag)):-1]
#get job
jobtag = "is a "
endjobtag = " who"
if ("is an" in c8):
jobtag = "is an "
if("works in" in c8): #has format "Tara, 23, works in X and currently resides in Y"
jobtag = "works in "
endjobtag = " and currently resides"
if("is the" in c8):
jobtag = "is the "
endjobtag = " and currently resides"
if (endjobtag not in c8):
endjobtag = " living in"
contestantDict['occupation'] = c8[(c8.index(jobtag)+len(jobtag)):(c8.index(endjobtag))] #add name to dict
#get elimination week
if ("Sarah Stone" in name): #hard-code season 8 winner
contestantDict['elimination'] = "Winner"
else:
contestantDict['elimination'] = "unknown"
#add season
contestantDict['season'] = 8
#add dict to list of dicts
#if (contestantDict not in newList):
listOfContestantDicts.append(contestantDict) #add dict to list of dicts in this season
listAllDicts.append(contestantDict) #add dict to list of all dicts in ALL seasons
seasonsDict[8] = listOfContestantDicts #key = season num, val=list of contestant dicts
#Save to Disk
#import json
#fd = open("tempdata/seasonsDict.json", "w") #save dictionary to disk
#json.dump(seasonsDict, fd)
#fd.close()
#del seasonsDict
#with open("tempdata/seasonsDict.json", "r") as fd:
# seasonsDict = json.load(fd) #reload
# Convert list of contestant dictionaries to a pandas dataframe.
#
# Note: 'listAllDicts' has seasons = [2,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19]
# Here, we make a dataframe of Seasons 13 forward.
#
cDicts = []
for l in listAllDicts:
if (l['season'] > 12): #for seasons 13 forward,
d={}
d['name']=l['name']
d['age']=l['age']
d['hometown']=l['hometown']
d['occupation']=l['occupation']
d['elimination week']=l['elimination'] #for bachelors, value will be 'bachelor'
if 'group_dates' in l and 'individual_dates' in l:
d['group_dates'] = l['group_dates']
d['individual_dates'] = l['individual_dates']
d['roses_from_group_dates'] = l['roses_from_group_dates']
d['roses_from_individual_dates'] = l['roses_from_individual_dates']
else:
d['group_dates'] = 0
d['individual_dates'] = 0
d['roses_from_group_dates'] = 0
d['roses_from_individual_dates'] = 0
d['season']=l['season']
cDicts.append(d)
contestantDF = pd.DataFrame(cDicts)
contestantDF.drop_duplicates() #drop duplicates, just in case
contestantDF.head(5)
import json
with open('competition_data.json', 'w') as fp:
json.dump(weeklyCompData, fp)