Wikipedia Scrape

Here we scrape wikipedia pages that contain information about each season of The Bachelor. We are interested in the following fundamental data types: Age, Profession, Hometown. Additionally, for each week of the compeition, we scrape to find out if a contestant received a group date, a group rose, or a one-on-one date.

In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
from competition_data import *

Here we get the raw HTML from each page

In [3]:
#get wiki for all bachelor seasons
allseasons = requests.get("https://en.wikipedia.org/wiki/The_Bachelor_(U.S._TV_series)#Seasons")
soup = BeautifulSoup(allseasons.text, "html.parser") #make soup element

#get the table cell that has links to each episode
seasons = soup.find("table", attrs={"class":"navbox"}).find("td", attrs={"class":"navbox-list navbox-odd hlist"})
seasons = seasons.find("div", attrs={"style":"padding:0em 0.25em"}).find("ul")

urls = []                           #list of links to season-specific page
seasonNums = []                     #list of seasons w/ wiki pages (no seasons 1-4 or 6-8)
seasonNum = 1                       #season number
for item in seasons.find_all("li"): #for each item in list of seasons
    if (seasonNum == 20):           #don't include season 20, b/c no contestants listed yet
        break
    season = item.find("a")         #get url tag
    if season is not None:          #if has url link, get url text
        urls.append("\"https://en.wikipedia.org" + season.get("href") + "\"")
        seasonNums.append(seasonNum) #add season number to list 
    seasonNum += 1
    
wikiPageText = []                   #init list of wiki site text, for all seasons
for url in urls:
    site = requests.get(url[1:-1])  #get web-site for that url
    soup = BeautifulSoup(site.text, "html.parser") #make BS element
    wikiPageText.append(soup)       #add web-site text to list

wikiPages = dict(zip(seasonNums, wikiPageText)) #key=season, val=Soup Elem(wiki page text)

Now we parse through the tags of the HTML to get the fundamental data sources we are looking for.

In [4]:
# For each season in wiki, make list of dictionaries - one dictionary for each contestant.
# Also, make a list of dictionaries of all contestants for all seasons.
#
# list name       = listAllDicts  #a list of all dicts for all contestants and bachelors, all seasons
#
# dictionary name = seasonsDict
#             key = season number
#           value = list of dictionaries for that season (one for each contestant)
#             
# dictionary name = contestantDict
#            keys = name, age, hometown, occupation, elimination, season
#          values = associated values to fields, as scraped from wiki
#
# To test contestant dictionaries:
#         print seasonsDict[season][contestant][fieldname]
#    eg:  print seasonsDict[9][10]['name']  -- get name for season 9, contestant 10
#
# Note: Wiki does not have pages dedicated to Seasons 1-4, or 6-8. I added 2, 4, 6, and 8
# below, from other sources. Contestants for episode 20 are not added, because they are
# not public yet.
#
import sys  
reload(sys)  
sys.setdefaultencoding('utf8')

seasonsDict = dict()                #key = season num, val=list of contestant dictionaries
allContestants = dict()             #keys = name/age/etc, values = associated data
listAllDicts = []                   #list of dicts for all cont. and bach. for ALL seasons

for sn in seasonNums:
    seasonPage = wikiPages[sn]      #get BS element for this season
    seasonPage = seasonPage.find("div", attrs={"id":"content"}).find("div", attrs={"id":"bodyContent"})
    seasonPage = seasonPage.find("div", attrs={"id":"mw-content-text"})
    seasonPage = seasonPage.find("table", attrs={"class":"wikitable sortable"})
    
    listOfContestantDicts = []          #list of dicts for each contestant
    
    numtr = 0                           #num rows (one per contestant)
    for tr in seasonPage.find_all("tr"):#for each contestant listed,
        if (numtr == 0):                #skip first row (column headers)
            numtr += 1
            continue

        contestantDict = dict()         #init new dict for contestant
        numtd = 0                       #column number
        for td in tr.find_all("td"):    #for each column of data,
            
            #NAME
            if (numtd == 0):
                name = str(td.contents)
                if ("<b>" in name):
                    td.find("b")
                    name = str(td.contents)[4:-5]
                if (("[u'" in name) or ("[u\"" in name)):   #if "[u'name']",
                    name = name.encode('utf8')[3:-2]    #format to get 'name'
                if ("<span class" in name):
                    td.find("span", attrs={"class":"nowrap"})
                    tag = "<span class='nowrap'>"       #start tag before name
                    name = str(td.contents)[len(tag)+1:]#cut out start tag
                    end = name.index("<")               #get start point of end tag
                    name = name[:end]                   #cut out end tag
                    trashTag = "style=\"display:none;\">" #weird tag to cut from a name
                    if (trashTag in name):
                        name = name[(len(trashTag)+1):-1] 
                if ("<sup" in name):                    #if name has "name', <sup ...",
                    end2 = name.index("<sup")           #format to get name
                    name = name[:end2-3]
                if ("</b" in name):
                    name = name[:name.index("</b")]
                if ("href" in name):                    #if name has url
                    name = td.find("a").get("href")
                    name = td.get_text("title")
                if ("title" in name):                   #if 'title' in name, take it out
                    name = name[:name.index("title")]   #eg "Keltie Colleentitle[title20title]"
                contestantDict['name'] = name           #add name to dict
                
            #AGE
            if (numtd == 1):
                age = str(td.contents)
                if ("<b>" in age):
                    td.find("b")
                    age = str(td.contents)[4:-5]
                if ("[u'" in age):                      
                    age = age.encode('utf8')[3:5]
                if (age is None):                   #if no age (eg season 9, Cosetta Blanca)
                    age = "na"
                contestantDict['age'] = age
                
            #HOME
            if (numtd == 2):
                home = ""
                for url in td.find_all("a"):        #for each url to a place,
                    url.get("href")
                    home2 = url.get_text("title")   #get place name
                    if (len(home) > 0):             #if already have city,
                        home = home + ", " + home2  #concatenate state
                    else:                           #if no city yet (or home is one word),
                        home = home2                #save city name or home name

                if ("title" in home):               #format oddity in season 19, contest 1
                    indx = home.index("title")
                    home = home[:indx]
                if ("[" in home):                   #format oddity - homes end in ", ["
                    indx2 = home.index("[")         
                    home = home[:indx2-2]
                
                contestantDict['hometown'] = home

                
            #OCCUPATION
            if (numtd == 3):
                job = str(td.contents)
                if ("<b>" in job):
                    td.find("b")
                    job = str(td.contents)[4:-5]
                if (("[u'" in job) or ("[u\"" in job)):               
                    job = job.encode('utf8')[3:-2]  
                if ("href" in job):                 #if occupation has url
                    job = td.find("a").get("href")
                    job = td.get_text("title")
                if ("nowrap" in job):
                    job = td.get_text("span") #, attr={"class":"nowrap"})
                if ("title" in job):
                    titleindex = job.index("title")
                    job = job[:titleindex] + " " + job[(titleindex+len("title")):]
                if ("title" in job):                #sometimes, 'title' appears twice in 'occupation'
                    titleindex = job.index("title")
                    job = job[:titleindex] + " " + job[(titleindex+len("title")):]
                if ("below" in job):
                    job = "unknown"
                contestantDict['occupation'] = job   
                
            #ELIMINATION
            if (numtd == 4):
                elim = str(td.contents)
                if ("<b>" in elim):
                    td.find("b")
                    elim = str(td.contents)[4:-5]
                if ("[u'" in elim):                      
                    elim = elim.encode('utf8')[3:-2] 
                if("Eliminated in " in elim):
                    elim = elim[len("Eliminated in "):]
                if(("Quit in " in elim) or ("quit in " in elim)):
                    elim = elim[len("Quit in "):]
                if(("Week " in elim) or ("week " in elim)):  #remove "week", leave week number only
                    elim = elim[len("Week "):]
                if (("Returned" in elim) or ("returned" in elim)):
                    elim = elim[:elim.index("', <br/>")]
                contestantDict['elimination'] = elim

            numtd += 1
        numtr += 1
        
        contestantDict['season'] = sn   #include season num in dict
        
        listOfContestantDicts.append(contestantDict) #add dict to list of dicts in this season
        listAllDicts.append(contestantDict)         #add dict to list of all dicts in all seasons
    seasonsDict[sn] = listOfContestantDicts  #key = season num, val=list of contestant dicts

Now we get the "competitive" data - i.e. whether a contestant received a rose in a given week.

In [ ]:
from types import *
from collections import Counter
import operator
import re

def getCompetitionDetails(wikiPages, seasons):
    competitionDetails = dict()
    newerSeasons = [18,19] # these season have a different page layout and must be handled differently
    for sn in seasons:
        weeklyDetails = []
        seasonPage = wikiPages[sn]      #get BS element for this season
        seasonPage = seasonPage.find("div", attrs={"id":"content"}).find("div", attrs={"id":"bodyContent"})
        seasonPage = seasonPage.find("div", attrs={"id":"mw-content-text"})
        bodyElements = seasonPage.find_all(['h2','h3','p','table']) #put body elements into a list so they can be parsed through
        

        # iterater over the list of body elements to find the relevent data
        for i, element in enumerate(bodyElements):
            if sn in newerSeasons:
                # find the episodes section
                if element.name == 'h2' and not type(element.span) == NoneType:
                    if element.span.text == "Episodes":
                        # capture each table row into the episodesDetail list excluding 
                        # the first tr, which is the table header.
                        # the rows are arranged in pairs, with the first the row containing 
                        # the episode num, title and air date and the second row containing
                        # the episode summary
                        episodesDetails = bodyElements[i+1].find_all('tr')
                        for episodeIndex, episodeDetails in enumerate(episodesDetails):
                            
                            
                            if not episodeIndex % 2 == 0:
                                episodeNumber = (episodeIndex+1) / 2
                                if episodeIndex+1 < len(episodesDetails):
                                    episodeSummary = episodesDetails[episodeIndex+1].text
                                    episodeTitle = episodeDetails.find('td', {'class':'summary'}).text
                                    if "Week" in episodeTitle:
                                        weeklyDetails.append(episodeSummary)
                                    
                # get the tab
            else:
                text = element.find(text=True)
                if "Week" in str(text):
                    parsingSection = True
                    nextElementIndex = i+1
                    # parse through the next elements until we reach 
                    # elements pertaining to the next week
                    while parsingSection:
                        if not (nextElementIndex < len(bodyElements)):
                            break

                        nextElement = bodyElements[nextElementIndex]
                        # if the next element is an h3 element then it is the header of the 
                        # next we and we should stop here
                        if nextElement.name == 'h2' or nextElement.name == 'h3':
                            parsingSection = False
                            weeklyDetails.append(bodyElements[i+1:nextElementIndex])
                        # otherwise we go to the next element
                        else:
                            nextElementIndex += 1


        competitionDetails[sn] = weeklyDetails 

    return competitionDetails



def getContestantCompData(competitionDetails, allContestants):
    contestantCompData = {} 
    newerSeasons = [18, 19]
    
    for sn in list(allContestants.keys()):
        contestantsData = {}
        # for each episode determine who had a group or one on one date and whether they got a rose 
        for weekIndex, weekDetails in enumerate(competitionDetails[sn]):
            if sn in newerSeasons:
                episodeSummary = weekDetails
                hasIndividualDates = 'one-on-one' in episodeSummary.lower() or 'two-on-one' in episodeSummary.lower()
                hasGroupDates = 'group date:' in episodeSummary.lower()
                
                if hasIndividualDates or hasGroupDates:
                    for marker in ['one-on-one', 'group date:', 'two-on-one']:
                        dateType = marker.replace(' date:','') 
                        if not marker in episodeSummary.lower():
                            continue
                        # find all indexes of the markers specified
                        markerIndexes = [m.end() for m in re.finditer(marker, episodeSummary.lower())]
                        for markerIndex in markerIndexes:                        
                            endOfContestants = markerIndex + episodeSummary[markerIndex:].index('. ')
                            contestants = episodeSummary[markerIndex:endOfContestants]
                            endOfSection = episodeSummary[markerIndex:].index('\n') + markerIndex
                            section = episodeSummary[markerIndex:endOfSection]
                            generateRoseData(allContestants[sn], contestantsData, weekIndex+1, dateType, contestants, section)

            else:
                for detailsIndex, details in enumerate(weekDetails):
                    if not(type(details.b) == NoneType):
                        dateType = details.b.text.lower()
                        if "one-on-one" in dateType or "two-on-one" in dateType or "group" in dateType:
                            start = details.text.index(':') +2
                            end = details.text.index('. ')
                            contestants = details.text[start:end]
                            generateRoseData(allContestants[sn], contestantsData, weekIndex+1, dateType, contestants, details.text)
                                    
        contestantCompData[sn] = contestantsData
        
    return contestantCompData
  
def generateRoseData(allContestants, contestantsData, week, dateType, contestantsOnDate, section):

    # find the contestants that are on the date 
    contestants = []
    for contestant in allContestants:
        contestantFirstName = contestant['name'].split(' ')[0]
        if contestantFirstName in contestantsOnDate:
            contestants.append(contestantFirstName)
    # account for any contestants with same name
    contestants = dict(Counter(contestants))
    contestantsToRemove = []
    contestantsToAdd = []
    for name in contestants:
        if contestants[name] > 1:
            contestantsToRemove.append(name)
            for contestant in allContestants:
                if name in contestant['name']:
                    fullName = contestant['name'].split(' ')
                    if len(fullName) == 3:
                        firstName, middleName, lastName = fullName
                        if '(' in lastName and ')' in lastName:
                            ref = lastName
                            lastName = middleName
                            searchableName = firstName+' '+ref
                    else:
                        firstName, lastName = fullName
                        searchableName = firstName+' '+lastName[0]
                        #searchableName2 = firstName+lastName[0]
                    if searchableName in contestantsOnDate:
                        contestantsToAdd.append(searchableName)

    for name in contestantsToRemove:
        del contestants[name]

    for name in contestantsToAdd:
        contestants[name] = 1

    contestants = contestants.keys()
    

    # determine who got a rose
    if 'rose' in section:
        roseIndex = section.index('rose')
        endOfSentence = section[roseIndex:].index('.') + roseIndex
        startOfSentence = endOfSentence - section[:endOfSentence][::-1].index('.') + 1
        roseSentence = section[startOfSentence:endOfSentence]
        
        for contestant in contestants:
            contestantFirstName = contestant.split(' ')[0]
            receivedRose = False
            for got in ['got', 'receiv', 'gets', 'presents', 'gives','holds','has','giving', 'gave', 'extend']:
                if got in roseSentence and not (' not ' in roseSentence):
                    if contestantFirstName in roseSentence:
                        receivedRose = True
                        break
                    elif 'one' in dateType: # we assume the pronoun refers to contestant on the date
                            receivedRose = True
                            break
                    elif 'group' in dateType: # we assume the pronoun refers to the last mentioned contestant
                        contestantIndex = {}
                        for c in contestants:
                            contestantIndex[c.split(' ')[0]] = section[:endOfSentence][::-1].index(c.split(' ')[0][::-1])

                        closestContestant = min(contestantIndex.iteritems(), key=operator.itemgetter(1))[0]
                        if closestContestant in contestant:
                            receivedRose = True
                            break
                            
            compData = (week, dateType, receivedRose)

            if contestant in contestantsData:
                contestantsData[contestant].append(compData)
            else:
                contestantsData[contestant] = [compData] 
def getWeeklyCompData(contestantCompData):
    result = {}
    for sn in contestantCompData:
        weeklyCompData = {}
        for contestant in contestantCompData[sn]: 
            dates = contestantCompData[sn][contestant]
            for week,dateType,receivedRose in dates:
                if not week in weeklyCompData:
                    weeklyCompData[week] = {}
                if not contestant in weeklyCompData[week]:
                    weeklyCompData[week][contestant] = {}
                if not dateType in weeklyCompData[week][contestant]:
                    weeklyCompData[week][contestant][dateType] = {}

                weeklyCompData[week][contestant][dateType] = receivedRose
        result[sn] = weeklyCompData
    return result

def addCompetitionData(seasonsDict, contestantCompData):
    for sn in seasonsDict:    
        for contestant in seasonsDict[sn]:
            contestant['group_dates'] = 0
            contestant['individual_dates'] = 0
            contestant['roses_from_group_dates'] =  0
            contestant['roses_from_individual_dates'] = 0
            for contestantFirstName in contestantCompData[sn]:
                contestantName = "None"
                if contestantFirstName == "" or " " in contestantFirstName or len(contestantFirstName) < 3:
                    continue
                if contestantFirstName[-1].isupper():
                    contestantName = contestantFirstName[:-1] + ' ' + contestantFirstName[-1]

                if contestantFirstName in contestant['name'] or contestantName in contestant['name']:
                    dates = contestantCompData[sn][contestantFirstName]
                    numGroupDates = 0
                    numIndividualDates = 0
                    numRoseOnGroupDates = 0
                    numRoseOnIndividualDates = 0
                    for week, dateType, receivedRose in dates:
                        if 'group' in dateType.lower():
                            numGroupDates += 1
                            if receivedRose:
                                numRoseOnGroupDates += 1
                        if 'one' in dateType.lower():
                            numIndividualDates += 1
                            if receivedRose:
                                numRoseOnIndividualDates += 1
                        
                    
                    contestant['group_dates'] = numGroupDates
                    contestant['individual_dates'] = numIndividualDates
                    contestant['roses_from_group_dates'] =  numRoseOnGroupDates
                    contestant['roses_from_individual_dates'] = numRoseOnIndividualDates
                    break
In [5]:
# Get the details of each competition for each week
# compettionDetails is a dictionary: key = season number, val = list of weekly summary 
competitionDetails = getCompetitionDetails(wikiPages, seasonsDict) 
# Get the detailts of each contestants performances for each season
# contestantCompData is a dictionary: key = season number, val = dict with contestant 
# name as keys (values are tuples = (week num [int], date type [str], received rose [bool]))
contestantCompData = getContestantCompData(competitionDetails, seasonsDict)
# Get each contestants performance for each week of each season
# weeklyCompData is a dict: key = season number, val = dict with week number as keys 
# (values are dict: key = contestant name, val = dict: key=date type val=reeived rose (bool))
weeklyCompData = getWeeklyCompData(contestantCompData)
# Add the contestantCompData to contestant dictionaries in seasonsDict
addCompetitionData(seasonsDict, contestantCompData)

We now add the fundamental data for each Bachelor - i.e. where is he from, what is his job, how old is he.

In [6]:
# Add the men (the Bachelors) for all seasons. 
#
# FIRST: get all data and add it to a dictionary, one for each bachelor:
#     dictionary name = bachelorDict
#                keys = name, age, hometown, occupation, elimination=bachelor, season
#              values = associated values to fields, as scraped from wiki
#
#     For example, here is the dictionary for the first bachelor (Season 1):
#          {'name': 'Alex Michel', 'hometown': 'Charlottesville, Virginia', 
#          'age': 32, 'season': '1', 'elimination': 'bachelor', 
#          'occupation': 'Management consultant'}
#
# SECOND: add this data to the list that has data for all contestans and bachelors:
#      listAllDicts    -   a list of all dicts for all cont and bachelors, all seasons
#
# Bachelors can be identified easily in this list because their 'elimination' column value 
# is 'bachelor' (whereas contestants have 'elimination' column values 'runner-up', or 
# '7' for week seven).
#

#go to wiki homepage for The Bachelor, make soup element
allseasons = requests.get("https://en.wikipedia.org/wiki/The_Bachelor_(U.S._TV_series)#Seasons")
soup = BeautifulSoup(allseasons.text, "html.parser") 

#get the table cell that has links to each episode
men = soup.find("table", attrs={"class":"wikitable plainrowheaders"})
men = men.find_all("tr")

numRow = 0
for man in men:                                 #for each bachelor in the table,
    bachelorDict = dict()                       #init new dict for this bachelor
    bachAge = "unknown"                         #default values for those we can't find
    bachHometown = "unknown"
    bachOccupation = "unknown"
    if (numRow == 0):                           #skip first row - col titles
        numRow += 1
        continue
    if (numRow > 19):                           #don't collect data on Season 20 Bachelor
        break
        
    numCol = 0
    for col in man.find_all("td"):              #for each col in this bachelor row,       

        #SEASON NUMBER
        if (numCol == 0):                              
            seasonNum = col.get_text()
            seasonNum = seasonNum.encode('utf8')
        
        if (numCol == 1):                       #get season year (to calculate age later)
            col = col.get_text()
            if ("[" in col):
                seasonYear =  col.encode('utf8')[-8:-4]
            else:
                seasonYear =  col.encode('utf8')[-4:]
        
        #Get URL for bachelor's personal site, make soup element 
        if (numCol == 2):                               
            manURL = col.find("a").get("href")
            manPage = requests.get("https://en.wikipedia.org" + manURL)
            bachSoup = BeautifulSoup(manPage.text, "html.parser") 
            manSoup = bachSoup.find("table", attrs={"class":"infobox biography vcard"})
            if (manSoup is None):
                manSoup = bachSoup.find("table", attrs={"class":"infobox vcard"})
            if (manSoup is not None):
                manSoup = manSoup.find_all("tr")
            
                #go to 'biography box' on bachelor's personal site
                bioRow = 0
                for row in manSoup:
                    #BACHELOR AGE
                    bornYear = row.find("span", attrs={"class":"bday"})
                    if (bornYear is not None):
                        bornYear = bornYear.get_text()
                        bornYear = bornYear.encode('utf8')[:4]
                        bachAge = int(seasonYear) - int(bornYear)   #calculate age
                    
                    #BACHELOR HOMETOWN
                    bachHome = row.find("span", attrs={"class":"birthplace"})
                    if (bachHome is None):
                        bachHome = row.find("td", attrs={"class":"birthplace"})
                    if (bachHome is not None):
                        bachHometown = bachHome.find("a") #.get("href")
                        bachHometown = str(bachHometown.contents[0])
                    if ("New York City" in bachHometown):
                        bachHometown = "New York City, New York"

                    bioRow += 1

            #BACHELOR NAME
            bachName = col.get_text()
            bachName = bachName.encode('utf8')
            if ("[" in bachName):
                bachName = bachName[:-4]
                

        #BACHELOR OCCUPATION
        if (numCol == 3):                               
            bachOccupation = col.get_text()      
            bachOccupation = bachOccupation.encode('utf8')
            
        numCol += 1
    
    #hard-code data for Bachelors who don't have own wiki page
    if ("Grant" in bachName):
        bachHometown = "London, UK"
        bachAge = "27"
    if ("Womack" in bachName):
        bachHometown = "Austin, Texas"
        bachAge = "37" 
    if ("Flajnik" in bachName):
        bachHometown = "Sonoma, California"
        bachAge = "28" 
    if ("Soules" in bachName):
        bachHometown = "Arlington, Iowa"
        bachAge = "33"
    if ("Palmer" in bachName):
        bachHometown = "Toronto, Ontario"
    
    #add info to bachelor dictionary
    bachelorDict['name'] = bachName  
    bachelorDict['age'] = bachAge
    bachelorDict['hometown'] = bachHometown
    bachelorDict['occupation'] = bachOccupation
    bachelorDict['elimination'] = "bachelor"
    bachelorDict['season'] = int(seasonNum.encode('utf8'))
    
    #add bachelor dictionary to list of all dicts for all people in all seasons
    listAllDicts.append(bachelorDict)         
    
    numRow +=1  #get next bachelor from wiki table
    
    
#Non-wiki Data sources:
#Grant: http://www.realitytvworld.com/news/matt-grant-dishes-on-upcoming-the-bachelor-london-calling-finale-7066.php
#Womack: http://www.people.com/people/article/0,,20429663,00.html
#Soules:http://www.people.com/article/chris-soules-new-bachelor-in-love

For earlier seasons, Wikipedia does not have the data - so we look to other websites to add in the data.

In [8]:
# Get data for Seasons 2, 4 and 6.
#
# Despite much effort, I could not get the text from the'realitytvword.com' sources below.
# The Beautiful Soup elements did not match the "inspect element" html tags.  
# (Oddly, I was able to scrape from 'realitytv.about.com' for season 8 - see below.)
#
# I tried the following suggestions, but they did not work:
#    https://www.reddit.com/r/learnpython/comments/2nqhzw/how_come_a_websites_page_source_html_is_different/
#    http://stackoverflow.com/questions/26913316/beautiful-soup-doesnt-get-full-webpage
# To keep on schedule, I hand-entered the data. If time permits, I will come back to this.
#
# SEASON 2 SOURCES
# 1) http://www.realitytvworld.com/#$$nxtmgs&&BYitVosLEeWjwgrBiYTF8Q$$
# 2) winner: http://www.realitywanted.com/shows/the-bachelor/season-2
#
# SEASON 4 SOURCES
# 1) http://www.realitytvworld.com/#$$nxtgih&&Dvr38or5EeW1VRL/9wgFGw$$
# 2) http://draheid.com/archives/bachelor4/messages/1452262/1105037.html
# 3) winner: http://www.realitywanted.com/shows/the-bachelor/season-4
#
# SEASON 6 SOURCES
# 1) Source:(looks different in Safari versus Chrome) http://www.realitytvworld.com/#$$nxt6je&&9hhYJIraEeWixQqIPWP/qw$$
# 2) Alt. Source: http://www.realitytvworld.com/news/abc-releases-identities-of-sixth-bachelor-edition-bachelorettes-2880.php")
# 3) FYI, source without occupations, but with pictures: 
# (http://community.realitytvworld.com/cgi-sys/cgiwrap/rtvw2/community/dcboard.cgi?az=printer_format&om=894&forum=DCForumID42)
# 4) Source of winner name: http://www.realitywanted.com/shows/the-bachelor/season-6
# 5) Mary Delgado source: http://www.sptimes.com/2003/09/26/Tampabay/No_wedding_bells__jus.shtml
#

#make array of data for contestants in Season 2
season2 = ["Heather, a 23-year-old sales coordinator who currently resides in Walnut Creek, CA",  
"Lori, a 26-year-old public relations representative who currently resides in Dallas, TX",  
"Heather, a 30-year-old flight attendant who currently resides in Watauga, TX",  
"Amber, a 25-year-old therapist who currently resides in Chapel Hill, NC",  
"Cari, a 28-year-old elementary school teacher who currently resides in Granite City, IL",  
"Christy, a 24-year-old radiologic technologist who currently resides in Avondale, AZ",  
"Hayley, a 28-year-old store manager who currently resides in Dana Point, CA",  
"Camille, a 29-year-old actress/model who currently resides in Los Angeles, CA",  
"Kyla Faye, a 22-year-old recording artist who currently resides in Midvale, UT",  
"Erin, a 25-year-old national magazine who currently resides in Chester, PA",  
"Frances, a 30-year-old strategic planning analyst who currently resides in San Francisco, CA",  
"Dana, a 24-year-old radio sales who currently resides in Beverly Hills, CA",  
"Merrilee, a 27-year-old teacher who currently resides in Forked River, NJ",  
"Suzi, a 27-year-old communications specialist who currently resides in Richmond, VA",  
"Anindita, a 27-year-old attorney who currently resides in New York, NY",  
"Fatima, a 22-year-old student who currently resides in Long Beach, CA",  
"Helene Eksterowicz, a 27-year-old school psychologist who currently resides in Glouchester, NJ",  
"Brooke Nicole, a 22-year-old student who currently resides in Tuscaloosa, AL",  
"Liangy, a 30-year-old paralegal who currently resides in Coral Gables, FL",  
"Erin, a 23-year-old interior designer who currently resides in Houston, TX",  
"Suzanne, a 32-year-old flight attendant who currently resides in Redondo Beach, CA",  
"Angela, a 26-year-old registered nurse who currently resides in Kansas City, MO",  
"Shannon, a 25-year-old graphic artist who currently resides in Hicksville, NY",  
"Christi Diane, a 23-year-old financial advisors asst. who currently resides in Eagle, ID",  
"Gwen, a 31-year-old executive recruiter who currently resides in Chester Springs, PA"] 


#make array of data for contestants in Season 4
season4= ["Brooke, a 24-year-old Teacher who currently resides in Bartlett, TN", 
"Lee-Ann, a 24-year-old Second Grade Teacher who currently resides in  Athens, GA", 
"Shea, a 25-year-old Firefighter who currently resides in Shreveport, LA", 
"Mary, a 35-year-old Sales Manager who currently resides in Tampa, FL", 
"Lindsay, a 23-year-old Professional Dancer who currently resides in Los Angeles, CA", 
"Estella Gardinier, a 27-year-old Mortgage Broker who currently resides in Beverly Hills, CA", 
"Lanah, a 27-year-old Event Coordinator who currently resides in Poolesville, MD", 
"Jenny, a 30-year-old Marketing Director who currently resides in Austin, TX", 
"Kristi, a 24-year-old Loan Processor who currently resides in Chicago, IL", 
"Lindsay, a 25-year-old Pharmaceutical Sales who currently resides in Mauldin, SC", 
"Shelly, a 26-year-old Pharmaceutical Sales who currently resides in Wanwatosa, WI", 
"Kelly Jo, a 23-year-old Director of Community Relations who currently resides in  Kalamazoo, MI", 
"Antoinette, a 30-year-old Senior Account Manager who currently resides in Philadelphia, PA", 
"Stacey, 26-year-old a Hair Stylist who currently resides in  Massillon, OH", 
"Heather, a 24-year-old Recent College Graduate who currently resides in Chicago, IL",
"Meredith, a 29-year-old Model/ Makeup Artist who currently resides in West Hollywood, CA", 
"Misty, a 23-year-old Radio Promotions Assistant who currently resides in Dallas, TX", 
"Christine, a 24-year-old Administrative Assistant who currently resides in Corona, CA", 
"Jenn, a 26-year-old Elementary School Teacher who currently resides in La Jolla, CA", 
"Leona, a 25-year-old Realtor's Assistant who currently resides in Chicago, IL", 
"Samantha, a 25-year-old Kitchen Designer who currently resides in Chicago, IL", 
"Julie, a 29-year-old Sales/ Modeling who currently resides in Louisville, KY", 
"Karin, a 32-year-old Mortgage Consultant who currently resides in Brooklyn Park, MN", 
"Lauren, a 24-year-old Retail Buyer who currently resides in Redondo Beach, CA", 
"Darla, a 26-year-old Attorney who currently resides in Gainesville, FL"] 

#make array of data for contestants in Season 6
season6 = ["Abby, a 29-year-old acrobat who currently resides in Henderson, NV", 
"Alma Rubenstein, a 35-year-old cafe owner who currently resides in Astoria, OR",
"Amanda, a 27-year-old cosmetics buyer who currently resides in New York, NY", 
"Amy, a 27-year-old marketing consultant who currently resides in San Diego, CA", 
"Andrea, a 33-year-old dental hygienist who currently resides in Denver, CO", 
"Ashley, a 31-year-old teacher who currently resides in Santa Barbara, CA", 
"Carolyn, a 36-year-old financial advisor who currently resides in Tulsa, OK", 
"Cheresse, a 31-year-old advertising director who currently resides in St. Louis, MO", 
"Cynthia, a 37-year-old charity foundations director who currently resides in Hermosa Beach, CA", 
"Elizabeth, a 28-year-old in pharmaceutical sales who currently resides in Chicago, IL", 
"Jayne, a 37-year-old dog groomer, who currently resides in Key Largo, FL",
"Jennifer, a 31-year-old account executive who currently resides in Seattle, WA", 
"Kelly, a 34-year-old actress who currently resides in Beverly Hills, CA", 
"Kerry, a 31-year-old nurse who currently resides in San Francisco, CA", 
"Kristie, a 32-year-old bar owner who currently resides in Windsor, Canada", 
"Kristin, a 27-year-old office manager who currently resides in Pensacola, FL", 
"Krysta, a 28-year-old financial analyst who currently resides in Oklahoma City, OK", 
"Leina, a 28-year-old advertising associate who currently resides in Chula Vista, CA", 
"Lisa, a 33-year-old teacher who currently resides in West Palm Beach, FL", 
"Melinda, a 39-year-old photographer who currently resides in Nashville, TN", 
"Natalie, a 34-year-old in retail sales who currently resides in Santa Monica, CA", 
"Nicole, a 28-year-old executive recruiter who currently resides in Libertyville, IL", 
"Susie, a 32-year-old insurance broker who currently resides in Hollywood, CA", 
"Tanya, a 31-year-old teacher who currently resides in Plano, Texas", 
"Wende, a 28-year-old model who currently resides in Austin, Texas",
"Mary Delgado, a 35-year-old real estate agent who currently resides in Tampa Bay, FL"]
In [9]:
# Add contestant data for seasons 2, 4 and 6 to dictionary 'contestantDict'.
#
# param : array of strings with contestant data
# param : season number
# param : winner name
def addNonWikiData(contestantArray, seasonNum, winnerName):
    for line in contestantArray:
        firstComma = line.index(',')                    #parse string
        startAge = line.index(" a ")
        jobTag = "year-old "     
        startJob = line.index(jobTag)
        homeTag = "currently resides in "
        startHome = line.index(homeTag)
        contestantDict = dict()                        #init new dict for contestant   
        contestantDict['name'] = line[:firstComma]     #put field data into dictionary
        contestantDict['age'] = line[startAge+3:startAge+5]
        contestantDict['hometown'] = line[startHome + len(homeTag):]
        contestantDict['occupation'] = line[startJob + len(jobTag):line.index("who")-1]
        contestantDict['season'] = seasonNum
    
        if (winnerName in line):                       #if this is the Winner,
            contestantDict['elimination'] = "Winner"   #add 'winner' to 'elimination' field
        else:
            contestantDict['elimination'] = "unknown"
        listOfContestantDicts.append(contestantDict)   #add dict to list of dicts for this season
        listAllDicts.append(contestantDict)           #add dict to list of all dicts in all seasons
           
    seasonsDict[seasonNum] = listOfContestantDicts     #key = season, val=list of contestant dicts
    
    
#add contestant data for seasons 4 and 6 to the contestant dictionary
addNonWikiData(season2, 2, "Eksterowicz")
addNonWikiData(season4, 4, "Gardinier")
addNonWikiData(season6, 6, "Delgado")
In [10]:
# Get data for Season 8, add to dictionary

#get site with season 8 contestants, make soup element
seasonEight = requests.get("http://realitytv.about.com/od/thebachelor8/ig/Ladies-of-The-Bachelor--Paris/")                #get site
season8= BeautifulSoup(seasonEight.text, "html.parser")

#get the table cell that has links to each episode
eight = season8.find("body", attrs={"id":"imagegalleryIndexPage"})
eight = eight.find("main", attrs={"id":"main"})
eight = eight.find("div", attrs={"class":"container"})
eight = eight.find_all("div", attrs={"class":"row"})[1]
eight = eight.find("div", attrs={"class":"col col-11"}).find("div", attrs={"class":"row"})
eight = eight.find("div", attrs={"class":"col col-8"})
eight = eight.find("div", attrs={"class":"content widget gallery-index-content"})
eight = eight.find("ul")

urls8 = []                       #list of urls for season 8 contestant pages
for item in eight.find_all("li", attrs={"itemtype":"http://schema.org/ImageObject"}):#for each contestant in list of season 8 contestants
    url8 = item.find("a")        #get url tag
    if url8 is not None:         #if has url link, get url 
        urls8.append("\"http://realitytv.about.com" + url8.get("href") + "\"")

#add contestant site leftover from next page
urls8.append("\"http://realitytv.about.com/od/thebachelor8/ig/Ladies-of-The-Bachelor--Paris/Shiloh-of-The-Bachelor--Paris.htm\"")  


cont8Sites = []                  #list of soup objects for season 8 contestant sites
for link in urls8:
    site8 = requests.get(link[1:-1]) 
    soup8 = BeautifulSoup(site8.text, "html.parser") #get soup element
    cont8Sites.append(soup8)     #add soup element to list     

for cont8 in cont8Sites:         #for each soup element (one per contestant site),
    c8 = cont8.find("body", attrs={"id":"imagegalleryPage"}) #find data
    c8 = c8.find("main", attrs={"class":"slab"})
    c8 = c8.find("div", attrs={"class":"container"})
    c8 = c8.find_all("div", attrs={"class":"row"})[1]
    c8 = c8.find("div", attrs={"class":"col col-11"})
    c8 = c8.find("div", attrs={"id":"contentIntro"})
    c8 = c8.find("div", attrs={"class":"row"})
    c8 = c8.find("div", attrs={"class":"col col-6"})
    c8 = c8.find("div", attrs={"class":"muted subheading"}).getText()
    
    contestantDict = dict()     #init new dict for contestant

    #get name
    firstComma = c8.index(',')
    contestantDict['name'] = c8[:firstComma]
        
    #get age
    substrC8 = c8[firstComma+2:]
    secondComma = substrC8.index(',')
    contestantDict['age'] = substrC8[:secondComma]
        
    #get hometown
    hometag = "resides in "
    if (hometag not in c8):
        hometag = "living in "
    homeIndex = c8.index(hometag)
    contestantDict['hometown'] = c8[(homeIndex+len(hometag)):-1]
        
    #get job
    jobtag = "is a "
    endjobtag = " who"
    if ("is an" in c8):
        jobtag = "is an "
    if("works in" in c8):   #has format "Tara, 23, works in X and currently resides in Y"
        jobtag = "works in "
        endjobtag = " and currently resides"
    if("is the" in c8):
        jobtag = "is the "
        endjobtag = " and currently resides"
    if (endjobtag not in c8):
        endjobtag = " living in"
    contestantDict['occupation'] = c8[(c8.index(jobtag)+len(jobtag)):(c8.index(endjobtag))]   #add name to dict

    #get elimination week
    if ("Sarah Stone" in name):         #hard-code season 8 winner
        contestantDict['elimination'] = "Winner"
    else:
        contestantDict['elimination'] = "unknown"
    
    #add season
    contestantDict['season'] = 8
        
    #add dict to list of dicts
    #if (contestantDict not in newList):
    listOfContestantDicts.append(contestantDict) #add dict to list of dicts in this season
    listAllDicts.append(contestantDict)  #add dict to list of all dicts in ALL seasons

seasonsDict[8] = listOfContestantDicts  #key = season num, val=list of contestant dicts
In [11]:
#Save to Disk

#import json
#fd = open("tempdata/seasonsDict.json", "w")   #save dictionary to disk
#json.dump(seasonsDict, fd)
#fd.close()

#del seasonsDict
#with open("tempdata/seasonsDict.json", "r") as fd: 
#    seasonsDict = json.load(fd)               #reload 
In [12]:
# Convert list of contestant dictionaries to a pandas dataframe.
#
# Note: 'listAllDicts' has seasons = [2,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19] 
# Here, we make a dataframe of Seasons 13 forward.
#

cDicts = []                  
for l in listAllDicts:
    if (l['season'] > 12):       #for seasons 13 forward,
        d={}
        d['name']=l['name']
        d['age']=l['age']
        d['hometown']=l['hometown']
        d['occupation']=l['occupation']
        d['elimination week']=l['elimination']  #for bachelors, value will be 'bachelor'
        if 'group_dates' in l and 'individual_dates' in l:
            d['group_dates'] = l['group_dates']
            d['individual_dates'] = l['individual_dates']
            d['roses_from_group_dates'] = l['roses_from_group_dates']
            d['roses_from_individual_dates'] = l['roses_from_individual_dates']
        else:
            d['group_dates'] = 0
            d['individual_dates'] = 0
            d['roses_from_group_dates'] = 0
            d['roses_from_individual_dates'] = 0
        d['season']=l['season']
        cDicts.append(d)
        
contestantDF = pd.DataFrame(cDicts)
contestantDF.drop_duplicates()  #drop duplicates, just in case
contestantDF.head(5)
Out[12]:
age elimination week group_dates hometown individual_dates name occupation roses_from_group_dates roses_from_individual_dates season
0 25 Winner 2 Dallas, Texas 1 Melissa Rycroft Sales Representative 0 1 13
1 24 Runner-up 1 Grand Rapids, Michigan 1 Molly Malaney Department Store Buyer 1 1 13
2 29 7 2 Peace River, Alberta 1 Jillian Harris Interior Designer 0 0 13
3 24 6 3 Carlsbad, California 0 Naomi Rose Crespo Flight Attendant 1 0 13
4 34 5 0 Huntsville, Alabama 2 Stephanie Hogan Single Mother & Medical Marketing Rep. 0 2 13
In [18]:
import json
with open('competition_data.json', 'w') as fp:
    json.dump(weeklyCompData, fp)
In [ ]: