Twitter Sentiment Analysis

The following code takes the data scraped from Twitter and converts it into the predictors that will feed the later regression. We are interested in two types of predictors from the Twitter data (a toy sketch of both follows the list):

1) The share of tweets a contestant received for one episode.

2) The general positivity of a contestant's tweets for one episode.
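
As a concrete sketch of what these two predictors look like, the toy numbers below are purely illustrative (they are not output of the scraping pipeline); predictor 1 is a tweet share and predictor 2 is a mean per-tweet positivity score of the kind computed later in this notebook.

#Toy illustration of the two predictors (all numbers are made up)
contestant_tweets = 40        #tweets mentioning this contestant in one episode
episode_tweets = 200          #tweets mentioning any contestant in that episode
tweet_scores = [1, 0, -1, 1]  #per-tweet positivity scores (+1/0/-1), defined later

tweet_share = float(contestant_tweets) / episode_tweets      #predictor 1: 0.2
positivity = float(sum(tweet_scores)) / len(tweet_scores)    #predictor 2: 0.25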

In [28]:
%matplotlib inline

import oauth2
import simplejson
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
import datetime
import json
import unittest, time, re
from pattern.en import parse
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
from sklearn.feature_extraction import text
import nltk
import itertools
In [2]:
#Load Contestant Name Data from wiki scrape
with open("tempdata/seasonsDict.json") as json_file:
    wiki_data = json.load(json_file)

#Fix known formatting problems:
wiki_data['19'][19]['eliminated'] = u'Eliminated in week 2'
wiki_data['19'][20]['eliminated'] = u'Eliminated in week 1'

#Keep only the first 29 entries for season 19
wiki_data['19'] = wiki_data['19'][0:29]

#Load date guide
date_guide = pd.read_csv("date_guide.csv")
In [3]:
#Get all contestant names
cont_nam = []
for wkey in wiki_data.keys():
    for person in wiki_data[wkey]:
        cont_nam.append(person['name'])

#Strip names with weird formatting
def url_strip(r):
    if bool(re.search("href", r)):
        oval = r.split(">")[1].replace("</a","")
    else:
        oval = r
    if bool(re.search("u\"", oval)):
        return oval.replace("u\"","").replace("[","")
    else:
        return oval
    
full_names = map(url_strip, cont_nam)

#Get just first names
first_names = set(map(lambda r: r.split(" ")[0], full_names))

bach_names = ['Jason', 'Jake', 'Brad', 'Ben', 'Sean', 'Juan', 'Chris']

Create Corpus

Here we create a corpus of adjectives and adverbs drawn from our entire body of tweets. Since tweets are not copy-edited, we have to filter the text aggressively to collect usable words.
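
As a minimal illustration of the part-of-speech step used below (the sentence is made up), NLTK tags adjectives as JJ/JJR/JJS and adverbs as RB/RBR/RBS, which are the tags we keep:

#Illustrative only - a made-up sentence run through the same NLTK calls used below
sample = "She was really funny and very sweet tonight"
print(nltk.pos_tag(nltk.word_tokenize(sample)))
#Expected output (roughly):
#[('She', 'PRP'), ('was', 'VBD'), ('really', 'RB'), ('funny', 'JJ'),
# ('and', 'CC'), ('very', 'RB'), ('sweet', 'JJ'), ('tonight', 'NN')]
#Only 'really', 'funny', 'very', and 'sweet' would be kept for the corpus.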

In [4]:
#First load all the tweets from all seasons
all_tweets = []
for iseason in range(12,20):
    file_name = "tweets" + str(iseason) + ".json"
    with open(file_name) as json_file:
        tdat = json.load(json_file)
    
    for tkey in tdat.keys():
        cont_dat = tdat[tkey]
        if cont_dat is not None:
            for cc in cont_dat:
                ep_dat = cc.keys()
                for tweet in cc[ep_dat[0]]:
                    all_tweets.append(tweet)
In [5]:
#Next flatten all the tweets into one list - where one sentence = one list entry

#Get all twitter sentences over all tweets   
#Flatten all sentences into an array
tweet_periods = map(lambda r: r.split("."), all_tweets)
tweet_flat1 = [item for sublist in tweet_periods for item in sublist]

tweet_questions = map(lambda r: r.split("?"), tweet_flat1)
tweet_flat2 = [item for sublist in tweet_questions for item in sublist]

tweet_exclaim = map(lambda r: r.split("!"), tweet_flat2)
tweet_flat3 = [item for sublist in tweet_exclaim for item in sublist]

#Filter out empty sentences
tweet_sentences = filter(lambda r: r != "", tweet_flat3)

#Replace hyphens and equals signs with spaces
tweet_sentences = map(lambda r: r.replace("-"," "), tweet_sentences)
tweet_sentences = map(lambda r: r.replace("="," "), tweet_sentences)

#Strip weird characters from words
tweet_encode = [tt.encode("ascii", "ignore") for tt in tweet_sentences]
tweet_process_output = map(lambda r: r.translate(None,"*@#\/[]()"), tweet_encode)

#Filter out strange results in our vocabulary (URLs, slashes, and tracking strings)
junk_pattern = re.compile(r"[/\\]|http|www|comnode|utm_|v=|lyw")
good_sentences = filter(lambda r: not junk_pattern.search(r), tweet_process_output)
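
For intuition, here is how a single made-up tweet flows through the splitting and filtering above (the tweet text is invented):

#Made-up tweet, traced through the same splitting steps as above
t = "Great episode! So funny. Right?"
step1 = [s for piece in [t] for s in piece.split(".")]    #['Great episode! So funny', ' Right?']
step2 = [s for piece in step1 for s in piece.split("?")]  #['Great episode! So funny', ' Right', '']
step3 = [s for piece in step2 for s in piece.split("!")]  #['Great episode', ' So funny', ' Right', '']
print(filter(lambda r: r != "", step3))                   #empty strings dropped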
In [6]:
%%time
#Use the NLTK package to tokenize each word in each sentence
#Collect only adjectives and adverbs
all_adj = []
for sentence in good_sentences:
    stokens = nltk.word_tokenize(sentence)
    for word, part_of_speech in nltk.pos_tag(stokens):
        if part_of_speech in ['JJ', 'JJS', 'JJR', 'RB', 'RBR', 'RBS', 'RP']:
            all_adj.append(word)
CPU times: user 15min 12s, sys: 6.49 s, total: 15min 19s
Wall time: 15min 32s
In [7]:
#Remove stop words (build the stopword set once so the filter stays fast)
stop_words = set(nltk.corpus.stopwords.words("english"))
full_corpus = filter(lambda r: r not in stop_words, all_adj)

#Remove words that are contestant names
full_corpus = filter(lambda r: r not in first_names, full_corpus)
full_corpus = filter(lambda r: r not in bach_names, full_corpus)

#Get unique values
our_corpus = set(full_corpus)
len(our_corpus)
Out[7]:
4146

Find sentiment of words in our corpus

We use the sentiment API at http://text-processing.com/, which returns the probability that a piece of text is positive or negative, based on a classifier trained on movie-review and Twitter data.
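
For reference, a single API call looks roughly like the sketch below. The response fields shown are an assumption based on how the results are used later (only the 'label' field is actually consumed), and the probability values are made up.

#Example request to the sentiment API (response values are illustrative)
resp = requests.post("http://text-processing.com/api/sentiment/", data={"text": "lovely"})
print(json.loads(resp.text))
#Expected shape, roughly:
#{"probability": {"neg": 0.25, "neutral": 0.4, "pos": 0.75}, "label": "pos"}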

In [11]:
#To stay under the text-processing.com rate limit (roughly 1,000 API requests per day per IP),
#we break the corpus into 5 chunks and score them separately
corpus1 = list(our_corpus)[0:900]
corpus2 = list(our_corpus)[900:1800]
corpus3 = list(our_corpus)[1800:2700]
corpus4 = list(our_corpus)[2700:3600]
corpus5 = list(our_corpus)[3600:]
In [14]:
#Create function that finds the positive probability of a single word
def corpus_prob(which_word):

    api_url = "http://text-processing.com/api/sentiment/"

    data_type = {"text": which_word}
    request_val = requests.post(api_url, data = data_type)

    return json.loads(request_val.text)

#Make function to find probabilities of a corpus
#Returns dictionary of corpus words and probabilities, saved to disk as probs<corpus_num>.json
def prob_of_corpus(which_corpus, corpus_num):

    #Query the API for every word in this sub-corpus (sleep between calls to respect the limit)
    json_list = []
    for wword in which_corpus:
        time.sleep(1)
        json_list.append(corpus_prob(wword))

    #Drop words whose keys cannot be encoded cleanly
    json_list_good = []
    good_corpus = []

    for corp, dicts in zip(which_corpus, json_list):
        try:
            good_corpus.append(corp.encode('utf-8'))
            json_list_good.append(dicts)
        except UnicodeError:
            pass
    probs = dict(zip(good_corpus, json_list_good))

    with open('probs'+str(corpus_num)+'.json', 'w') as fp:
        json.dump(probs, fp)

    print "corpus ", corpus_num, " done"

We retrieve the probabilities from the text-processing.com API. Each sub-corpus is run in its own cell so that the IP address can be changed between runs, keeping each run under the per-IP daily request limit.

In [15]:
%%time
prob_of_corpus(corpus1, 1)
corpus  1  done
CPU times: user 3.4 s, sys: 560 ms, total: 3.96 s
Wall time: 17min 55s
In [16]:
%%time
prob_of_corpus(corpus2, 2)
corpus  2  done
CPU times: user 3.23 s, sys: 446 ms, total: 3.68 s
Wall time: 19min 15s
In [17]:
%%time
prob_of_corpus(corpus3, 3)
corpus  3  done
CPU times: user 3.18 s, sys: 498 ms, total: 3.68 s
Wall time: 18min 32s
In [18]:
%%time
prob_of_corpus(corpus4, 4)
corpus  4  done
CPU times: user 3.32 s, sys: 482 ms, total: 3.8 s
Wall time: 18min
In [19]:
%%time
prob_of_corpus(corpus5, 5)
corpus  5  done
CPU times: user 1.97 s, sys: 301 ms, total: 2.27 s
Wall time: 10min 49s

Create dictionaries for each contestant for each episode

We want a dictionary keyed by season, contestant, and episode date. For each episode we store the number of tweets the contestant received and a positivity index; the contestant's share of tweets is computed later from these counts.
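
The target structure, sketched below with placeholder names, dates, and numbers, mirrors what the cells in this section build and save to twitter_sentiment.json (season keys become strings after the JSON round-trip):

#Hypothetical excerpt of the final dictionary (all values are placeholders)
example = {
    "19": {                                  #season
        "Some Contestant": {                 #contestant name from the wiki scrape
            "2015-01-05": {"ntweet": 42, "sentiment": 0.25},    #one episode date
            "2015-01-12": {"ntweet": 17, "sentiment": -0.10},
        },
    },
}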

In [20]:
#Read all dictionaries back in to get full corpus
with open("probs1.json") as json_file:
    probs1 = json.load(json_file)
with open("probs2.json") as json_file:
    probs2 = json.load(json_file)
with open("probs3.json") as json_file:
    probs3 = json.load(json_file)
with open("probs4.json") as json_file:
    probs4 = json.load(json_file)
with open("probs5.json") as json_file:
    probs5 = json.load(json_file)

corpus = {}
corpus.update(probs1)
corpus.update(probs2)
corpus.update(probs3)
corpus.update(probs4)
corpus.update(probs5)
In [24]:
#Similar to full corpus tweet processing - make function to get single tweet into manageable format
def tweet_process(tweet):
    #Split sentences up
    tweet_periods = tweet.split(".")
    tweet_questions = [tp.split("?") for tp in tweet_periods]
    tweet_flat1 = [item for sublist in tweet_questions for item in sublist]
    tweet_exclaim = [tq.split("!") for tq in tweet_flat1]
    tweet_flat2 = [item for sublist in tweet_exclaim for item in sublist]

    #Replace hyphens and equals signs with spaces
    tweet_sentences1 = [tf.replace("-"," ") for tf in tweet_flat2]
    return [tf.replace("="," ") for tf in tweet_sentences1]

#Similar to full corpus, use a tweet to find all adjectives + adverbs for that tweet
def tweet_part_of_speech(tweet_process_output):
    
    #Strip weird characters from words
    tweet_encode = [tt.encode("ascii", "ignore") for tt in tweet_process_output]
    tweet_process_output = map(lambda r: r.translate(None,"*@#\/[]()"), tweet_encode)
        
    #Get all adjectives from tweet
    all_adj = []
    for sentence in tweet_process_output:
        stokens = nltk.word_tokenize(sentence)
        for word, part_of_speech in nltk.pos_tag(stokens):
            if part_of_speech in ['JJ', 'JJS', 'JJR', 'RB', 'RBR', 'RBS', 'RP']:
                all_adj.append(word)

    #Filter out strange tokens (URLs, slashes, and tracking strings)
    junk_pattern = re.compile(r"[/\\]|http|www|comnode|utm_|v=|lyw")
    good_adj = filter(lambda r: not junk_pattern.search(r), all_adj)
    return good_adj

#Make a scoring function that takes the output of tweet_part_of_speech and returns a
#positivity score for the tweet: each adjective/adverb contributes +1 if its corpus label
#is "pos", -1 if "neg", and 0 otherwise, and the tweet's score is the mean of these values
#(0 if none of the words could be scored). A toy worked example follows this cell.
def is_tweet_positive(tweet_pos_output):
    probs = []
    for word in tweet_pos_output:
        try:
            word_label = corpus[word]['label']
            if word_label == "neg":
                probs.append(-1)
            elif word_label == "pos":
                probs.append(1)
            else:
                probs.append(0)

        except KeyError:
            #Word was not scored by the API, so it does not contribute to the score
            pass
    if len(probs) == 0:
        return 0
    else:
        return np.mean(probs)
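
As a quick worked example of this scoring (with a toy corpus rather than the real one), a tweet containing one positive, one neutral, and one negative word averages out to zero:

#Toy illustration of the scoring logic (labels are made up, not from the real corpus)
toy_corpus = {"amazing": {"label": "pos"},
              "really":  {"label": "neutral"},
              "fake":    {"label": "neg"}}
scores = [1 if toy_corpus[w]["label"] == "pos" else
          (-1 if toy_corpus[w]["label"] == "neg" else 0)
          for w in ["amazing", "really", "fake"]]
print(np.mean(scores))   #0.0 -> the toy tweet reads as neutral overall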
        
In [25]:
#Create function that takes season, gets tweets for each episode/contestant
def tweets_by_season(use_season):
    with open("tweets"+str(use_season)+".json") as json_file:
        tdat = json.load(json_file)

    season_dates = date_guide[date_guide.Season == use_season]
    season_dict={}
    contestants = tdat.keys()

    for contestant in contestants:
        contestant_dict = {}
        cont_dat = tdat[contestant]
        if cont_dat is not None:
            for cc in cont_dat:
                episode_date = cc.keys()
                number_of_tweets = 0
                positive_index = 0
                for tweet in cc[episode_date[0]]:
                    number_of_tweets += 1
                    positive_index += is_tweet_positive(tweet_part_of_speech(tweet_process(tweet)))
                if number_of_tweets == 0:
                    sentiment = 0
                else:
                    sentiment = float(positive_index) / float(number_of_tweets)

                episode_dict = {episode_date[0]: {"ntweet": number_of_tweets, "sentiment":sentiment}}
                contestant_dict.update(episode_dict)
        season_dict.update({contestant: contestant_dict})
    return season_dict               
    
            
In [26]:
%%time

#Put all tweets together to form one large dictionary
season_nums = range(13,20)
tweet_dict = {}
for season_num in season_nums:
    dseason = tweets_by_season(season_num)
    tweet_dict.update({season_num : dseason})
    print season_num, "is done"
    
with open('twitter_sentiment.json', 'w') as fp:
    json.dump(tweet_dict, fp)
13 is done
14 is done
15 is done
16 is done
17 is done
18 is done
19 is done
CPU times: user 13min 47s, sys: 5.63 s, total: 13min 53s
Wall time: 13min 52s

Visualization

Here we visualize the share of tweets each contestant received per episode.

In [4]:
with open("twitter_sentiment.json") as json_file:
    twitter = json.load(json_file)
    
date_guide = pd.read_csv("date_guide.csv")
In [143]:
#Show for each season, how tweets vary by episode

def plot_tweets_by_season(use_season):
    season_twitter = twitter[str(use_season)]

    all_names = season_twitter.keys()
    all_episodes = date_guide[date_guide["Season"] == use_season]
    ntweet_array = np.zeros(all_episodes.shape[0])
    
    #Get number of tweets per episode per contestant
    all_tweets = []
    for i, cname in enumerate(all_names):
        if season_twitter[cname]:
            for episode in season_twitter[cname].keys():
                ep_index = np.where(episode == all_episodes['Date'])[0][0]
                ntweet_array[ep_index] = season_twitter[cname][episode]["ntweet"]
        all_tweets.append(ntweet_array)
        ntweet_array = np.zeros(all_episodes.shape[0])
                
    #Get total tweets per episode, summed over all contestants
    total_tweets = []
    for i, ep in enumerate(all_episodes['Episode']):
        total_tweets.append(np.sum([qq[i] for qq in all_tweets]))
    
    #Plot share of tweets
    colors = iter(sns.color_palette("hls", len(all_names)))

    for i, cname in enumerate(all_names):
        plot_array = [all_tweets[i][q] / float(total_tweets[q]) for q in range(len(all_episodes['Episode']))]
        plot_array = [np.nan if pp == 0 else pp for pp in plot_array]  
        plt.plot(all_episodes['Episode']-1, plot_array, color=next(colors), ls="-", marker="o", label=all_names[i])
    plt.xlim([1,9])

    plt.legend(bbox_to_anchor=(1.2,1), fontsize=13)
    plt.title("Season " + str(use_season) + " Share of Tweets per Episode")
    plt.xlabel("Episode")
    plt.ylabel("Share of Tweets")
    plt.show()
In [149]:
plot_tweets_by_season(15)
In [150]:
plot_tweets_by_season(16)
In [151]:
plot_tweets_by_season(17)
In [152]:
plot_tweets_by_season(18)
In [153]:
plot_tweets_by_season(19)

We see a consistent feature in the data: contestants who go far in the competition generally capture a large share of the tweets early in the season.
