Here we present two ways to scrape data from Twitter. The first is through the Twitter API. The trouble with the API method is that Twitter only returns tweets from roughly the last 7-10 days through its API, rendering our analysis useless for older seasons, which go back to 2009.
The second method, which we actually use, directly queries the Twitter website, dynamically scrolls through each Twitter search using a Chrome browser driven by Selenium, and then scrapes the resulting HTML, which we process with Beautiful Soup.
%matplotlib inline
import oauth2
from twython import Twython
import simplejson
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
import datetime
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import sys
import unittest, time, re
The API method is convenient, but only gives us very recent Twitter data. Below is an example of the type of code we would use to interact with the API.
#APP_KEY = "qtmevmQ18N1vyWTAXfxqmh4oN"
#APP_SECRET = "MdZibormo3teZPTfMyeLEcuzMURHYidArOml0GtOQyrl6dI13R"
#access_token = '2694571580-Y8DsMjB0iMTGmm3Pwpo6IL3enhhFdAZQSXDIxO8'
#access_secret = 'AYciwyU197r6adpNziDT8pB0tmT3bKIihMrx7SPfbofRO'
#twitter = Twython(APP_KEY, APP_SECRET, access_token, access_secret)
#search_results = oauth_req('https://api.twitter.com/1.1/statuses/home_timeline.json', \
# access_token, access_secret)
#for tweet in search_results["statuses"]:
#    print tweet["text"]
#Define Twitter GET function using OAUTH2
#Function from https://dev.twitter.com/oauth/overview/single-user
#def oauth_req(url, key, secret, http_method="GET", post_body="", http_headers=None):
#    consumer = oauth2.Consumer(key=APP_KEY, secret=APP_SECRET)
#    token = oauth2.Token(key=key, secret=secret)
#    client = oauth2.Client(consumer, token)
#    resp, content = client.request(url, method=http_method, body=post_body, headers=http_headers)
#    return content
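For reference, here is a minimal sketch of the same idea using Twython's built-in search endpoint instead of a raw OAuth2 request. The credentials below are placeholders and the query is illustrative; even with valid keys, search/tweets only reaches back roughly a week, which is the limitation that pushes us toward scraping.
from twython import Twython

#Placeholder credentials -- not valid keys, shown only for illustration
APP_KEY = "YOUR_APP_KEY"
APP_SECRET = "YOUR_APP_SECRET"
ACCESS_TOKEN = "YOUR_ACCESS_TOKEN"
ACCESS_SECRET = "YOUR_ACCESS_SECRET"

twitter = Twython(APP_KEY, APP_SECRET, ACCESS_TOKEN, ACCESS_SECRET)

#search/tweets only indexes roughly the last 7-10 days of tweets,
#regardless of any since:/until: operators in the query
results = twitter.search(q="#thebachelor", count=100)
for tweet in results["statuses"]:
    print tweet["text"]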
Manual scraping of Twitter presents two challenges:
1) Twitter uses JavaScript for interactive webpage scrolling. If a search produces many results, the reader is never shown a "Next Page" link at the bottom of the page; instead, Twitter automatically queries its JSON backend and dynamically loads more tweets.
To work around this, we use the Selenium package, which mimics scrolling the webpage for us. After scrolling through a set number of pages, we extract the HTML from the page, since Twitter has by then made sufficient XHR requests to load the tweets we need.
2) The page data is not in a convenient JSON format, so we must parse the HTML to get at the data.
Since we are interested in the positive/negative sentiment of tweets about a particular contestant, we initially added Twitter's built-in sentiment operators to our search queries (we later dropped this, as noted in the code below, because it appeared to undercount tweets). All we then need to do is count the tweet tags that we scraped.
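To make the query strings below easier to read, here is a quick illustration, not part of the scraping pipeline, of what the percent-encoded search URL decodes to (the contestant name and dates are made up). The removed sentiment filters were simply " :)" or " :(" appended to the query.
import urllib

#Example of the kind of URL scrape_page builds; contestant and dates are illustrative
encoded = ("https://twitter.com/search?f=tweets&vertical=default"
           "&q=%23thebachelor%20Whitney%20since%3A2015-01-04%20until%3A2015-01-07&src=typd")

#Decodes to: ...&q=#thebachelor Whitney since:2015-01-04 until:2015-01-07&src=typd
print urllib.unquote(encoded)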
#We borrow heavily from http://stackoverflow.com/questions/12519074/scrape-websites-with-infinite-scrolling
def scrape_page(since, until, contestant, \
                base_url="https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor", \
                pages_to_scroll=3):

    #### Initiate Chrome Browser #######
    #Must download ChromeDriver executable from https://sites.google.com/a/chromium.org/chromedriver/downloads
    driver = webdriver.Chrome('/Users/dcusworth/chrome_driver/chromedriver') #Specify location of driver
    driver.implicitly_wait(30)
    verificationErrors = []
    accept_next_alert = True

    #Create URL that will get the text
    ender = "&src=typd"

    #Use Twitter Sentiment Analysis - REMOVED as it may be underestimating tweets
    #if is_happy:
    #    sentiment = "%20%3A)"
    #else:
    #    sentiment = "%20%3A("

    since_time = "%20since%3A" + str(since)
    until_time = "%20until%3A" + str(until)
    contestant_name = "%20" + contestant
    final_url = base_url + contestant_name + since_time + until_time + ender
    #print final_url

    #Jump onto the webpage and scroll down
    delay = 3
    driver.get(final_url)
    for i in range(1, pages_to_scroll):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)
    html_source = driver.page_source

    #After scrolling enough times, get the text of the page
    data = html_source.encode('utf-8')
    driver.quit()

    return data
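As a rough usage sketch (the contestant name and date range here are illustrative, and ChromeDriver must be installed at the path above), a single call returns the raw HTML, which we then parse for tweet paragraph tags:
#Illustrative call: one episode window's worth of tweets for a single contestant
page = scrape_page(since="2015-01-05", until="2015-01-07", contestant="Whitney", pages_to_scroll=5)
soup = BeautifulSoup(page, "html.parser")
tweets = [p.get_text() for p in soup.find_all("p", attrs={"class": "TweetTextSize"})]
print len(tweets)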
We load in scraped Wikipedia data that gives us each contestant's name and the dates they appeared on The Bachelor. For each season/contestant pair, we then build a record of episode dates along with the positive and negative tweets.
#Load Contestant Name Data from wiki scrape
with open("tempdata/seasonsDict.json") as json_file:
    wiki_data = json.load(json_file)
#Fix known formatting problems:
wiki_data['19'][19]['eliminated'] = u'Eliminated in week 2'
wiki_data['19'][20]['eliminated'] = u'Eliminated in week 1'
w19 = []
for ww in wiki_data['19'][0:29]:
    w19.append(ww)
wiki_data['19'] = w19
#Scrape Web to find the airdates of each episode
#Use http://epguides.com/Bachelor/
sdat = requests.get("http://epguides.com/Bachelor/")
#Parse through Beautiful Soup
ssoup = BeautifulSoup(sdat.text, "html.parser")
#Get all episode text in rows
row_text = ssoup.find_all("pre")[0]
uurls = []
ep_nam = []
for r in row_text.find_all("a"):
    if "Week" in r.get_text():
        uurls.append(r.get("href"))
        ep_nam.append(r.get_text())
#Fix Season 19 episode problems
ep_nam[140:] = [ee + " (S19)" for ee in ep_nam[140:]]
good_dates = []
for uurl in uurls:
    time.sleep(1)

    #Open up subpages
    subpage = requests.get(uurl)
    soup2 = BeautifulSoup(subpage.text, "html.parser")

    #Find box with date in it
    pars = soup2.find_all("br")
    pp = pars[0].get_text().split()
    pind = ["Airdate" in d for d in pp]

    #Convert date from page into usable date
    date_string = "-".join(pp[np.where(pind)[0]+1: np.where(pind)[0]+4])
    date_string = re.sub(",", "", date_string)
    date_object = datetime.datetime.strptime(date_string, "%b-%d-%Y")
    good_dates.append(date_object.strftime("%Y-%m-%d"))
#Extract the Season Number
season_num = []
for ee in ep_nam:
    start_string = ee.find("(")
    season_num.append(int(ee[(start_string+2):(len(ee)-1)]))
#Count up Episode Numbers
ep_num = []
start_val = 0
season_start = 1
for i in range(len(season_num)-1):
    if season_num[i] == season_start:
        start_val += 1
        ep_num.append(start_val)
    else:
        season_start += 1
        start_val = 1
        ep_num.append(start_val)
ep_num.append(ep_num[-1] + 1)
#Put Season / Episodes / Dates into a Pandas Dataframe
date_guide = pd.concat([pd.Series(season_num, name="Season"), pd.Series(ep_num, name="Episode"), \
pd.Series(good_dates,name="Date")], axis=1)
#Save as CSV for other scripts
date_guide.to_csv("date_guide.csv")
#Use Date Guide + Wiki info to set up inputs to scrape_page
#For a given Season, get all contestant names
#For each contestant, find how many episodes they were on (up to, but not including, their elimination episode)
#For each of those episodes, scrape all tweets mentioning that contestant
#Output a dictionary keyed by contestant name, whose values are lists of {episode date: tweets} dictionaries
def scrape_season_tweets(season):
    season_dat = wiki_data[str(season)]
    all_eps = date_guide[date_guide.Season == season]
    result_dict = {}
    for sd in season_dat:

        #Get contestant's name
        cnam = sd["name"]
        if len(cnam.split(">")) > 1:
            cnam2 = cnam.split(">")[1]
            contestant = cnam2.encode("utf-8").split(" ")[0]
        else:
            contestant = cnam.encode("utf-8").split(" ")[0]
        for ch in ["[", "]", "u\"", "<", ">"]:
            contestant = contestant.replace(ch, "")
        print contestant

        #Find week they are eliminated, and then select weeks to run scraper
        elim = sd['eliminated']
        if ("Win" in elim) | ("Run" in elim):
            elim_week = all_eps.shape[0] - 1
            eweek = all_eps.iloc[0:elim_week]
            use_dats = eweek["Date"]
        else:
            elim_week = int(elim[(len(elim)-1):len(elim)]) - 1
            eweek = all_eps.iloc[0:elim_week]
            use_dats = eweek["Date"]
        dats = [datetime.datetime.strptime(idate, '%Y-%m-%d') for idate in use_dats]

        #For each date, run scraper, save in dictionary
        ep_dict = []
        if (len(dats) == 0) | ("href" in contestant):
            result_dict[contestant] = None
        else:
            for run_date in dats:

                #Make time range
                start_time = run_date + datetime.timedelta(days=-1)
                end_time = run_date + datetime.timedelta(days=2)

                #Collect all tweets
                tweet_page = scrape_page(since=start_time.strftime('%Y-%m-%d'), until=end_time.strftime('%Y-%m-%d'), \
                                         contestant=contestant, pages_to_scroll=10)
                soup = BeautifulSoup(tweet_page, "html.parser")
                user_tweets = soup.find_all("p", attrs={"class": "TweetTextSize"})
                each_tweet = [uu.get_text() for uu in user_tweets]

                #FOLLOWING CODE if doing Twitter-built-in sentiment analysis
                #Find all positive tweets
                #happy_time = scrape_page(since=start_time.strftime('%Y-%m-%d'), until=end_time.strftime('%Y-%m-%d'), \
                #                         is_happy=True, contestant=contestant)
                #soup = BeautifulSoup(happy_time, "html.parser")
                #happy_tweets = len(soup.find_all("p", attrs={"class": "TweetTextSize"}))

                #Find all sad tweets
                #sad_time = scrape_page(since=start_time.strftime('%Y-%m-%d'), until=end_time.strftime('%Y-%m-%d'), \
                #                       is_happy=False, contestant=contestant)
                #soup = BeautifulSoup(sad_time, "html.parser")
                #sad_tweets = len(soup.find_all("p", attrs={"class": "TweetTextSize"}))

                print run_date.strftime('%Y-%m-%d')

                #Save the results to a dictionary
                ep_dict.append({run_date.strftime('%Y-%m-%d'): each_tweet})
            result_dict[contestant] = ep_dict
    return result_dict
tweets13 = scrape_season_tweets(13)
with open('tweets13.json', 'w') as fp:
    json.dump(tweets13, fp)
tweets14 = scrape_season_tweets(14)
with open('tweets14.json', 'w') as fp:
    json.dump(tweets14, fp)
tweets15 = scrape_season_tweets(15)
with open('tweets15.json', 'w') as fp:
    json.dump(tweets15, fp)
tweets16 = scrape_season_tweets(16)
with open('tweets16.json', 'w') as fp:
    json.dump(tweets16, fp)
tweets17 = scrape_season_tweets(17)
with open('tweets17.json', 'w') as fp:
    json.dump(tweets17, fp)
tweets18 = scrape_season_tweets(18)
with open('tweets18.json', 'w') as fp:
    json.dump(tweets18, fp)
tweets19 = scrape_season_tweets(19)
with open('tweets19.json', 'w') as fp:
    json.dump(tweets19, fp)
tweets12 = scrape_season_tweets(12)
with open('tweets12.json', 'w') as fp:
    json.dump(tweets12, fp)
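As a quick sanity check (assuming the cells above ran and wrote the JSON files), we can read one of the dumps back and count how many tweets each contestant drew per episode:
#Re-load season 13's tweets and tally tweets per contestant per episode date
with open("tweets13.json") as fp:
    tweets13_check = json.load(fp)

for contestant, episodes in tweets13_check.items():
    if episodes is None:
        continue
    #each entry in episodes is a one-key dict of {episode date: list of tweet texts}
    counts = [(list(ep.keys())[0], len(list(ep.values())[0])) for ep in episodes]
    print contestant, counts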