Cusworth, Heal, Leinicke, Rodriguez.
We obtain (scrape) photos of most contestants from Seasons 13-19 of The Bachelor. We register (align) the grayscale images, crop them to a small region containing only the face and face-framing hair, then perform PCA on those cropped images. We then examine the contestants' varying levels of success as represented by the projection of the images onto the first two principal components. We see that in this PC-1&2 space, the 'winners' and 'runners up' classes can be separated by a linear classifier.
Due to the high dimensionality of this data set, allow about 2 hours for this notebook to run.
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
This 'sirlinksalot' seemed like a really promising place to find photos. It pointed us in the direction of other good sites too!
linkVec = []
for thisPage in [3]:  #[2,3,4,5]
    allLinkPage = requests.get("http://www.sirlinksalot.net/archives/thebachelor%d.html" % (thisPage))
    soup = BeautifulSoup(allLinkPage.text, "html.parser")
    rows = soup.find_all("a")
    for row in rows:
        rowString = str(row)
        checkASCII = [ord(letter) for letter in rowString]
        if max(checkASCII) < 127 and min(checkASCII) > 9:  #keep rows whose HTML is plain ASCII
            thisLink = str(row.get("href"))
            #skip internal links and ad links
            skipStrings = ["sirlinksalot", "clickbank.net", "pub43", "bilbo", "fastclick", "casalemedia"]
            if not any(s in thisLink for s in skipStrings):
                linkVec.append(thisLink)
print len(linkVec)
#keep the first 20 links, sorted to group similar website roots
linkVec = sorted(linkVec[0:20])
archivedLinkVec = []
count = 0
length = len(linkVec)
for thisLink in linkVec:
    #look each link up on the Wayback Machine and keep an archived capture if one exists
    allLinkPage = requests.get("https://web.archive.org/web/*/" + thisLink)
    soup = BeautifulSoup(allLinkPage.text, "html.parser")
    if soup.find_all("div", attrs={"class": "date captures"}):
        row = soup.find_all("div", attrs={"class": "date captures"})[0]
        row = row.find("a").get("href")
        archivedLinkVec.append(str("https://web.archive.org" + row))
    count = count + 1
    #print count, " out of ", length, " done"
#we can assume that each article from the Examiner website will have similar HTML formatting.
#So let us also note the root page.
rootPage = []
for link in archivedLinkVec:
    first = 50  #link.find("www")
    last = link.find(".com") + 4
    rootPage.append(link[first:last])
#collect into a data frame
linkDF = pd.DataFrame()
linkDF['archived_link']=archivedLinkVec
linkDF['root_page']= rootPage
myGroups = linkDF.groupby('root_page')
count = 0
for group in linkDF.root_page.unique():
    if group:
        count = count + 1
print linkDF.root_page.unique()
print "There are",count, "unique groups." #this shows that the number of unique websites
#whose formatting we need to learn is pretty limited.
#SEASON 13 BLUE BACKGROUND
season13Names =[]
season13PicLinks=[]
season13Link = "http://www.buddytv.com/slideshows/meet-the-cast-of-the-bachelor-season-13.aspx"
testPage=requests.get(season13Link)
soup = BeautifulSoup(testPage.text, "html.parser")
scripts = soup.find_all("script")
for script in scripts:
    if script.string:
        if "Slideshow.SlideshowNavigator" in str(script.string):
            mystring = str(script.string)
            myvec = mystring.split("{")
            myvec = myvec[7:32]
            for k in myvec:
                fromhere = str(k).find("Title")+8
                tohere = str(k).find("Blurb")-3
                season13Names.append(k[fromhere:tohere])
                fromhere = str(k).find("ImageId")+9
                tohere = str(k).find("PhotoCredit")-2
                season13PicLinks.append("http://images.buddytv.com/btv_2_%d_1_590_-1_0_/meet-the-cast-of--th.jpg" % int(k[fromhere:tohere]))
len(season13Names), len(season13PicLinks)
#SEASON 14 BLUE BACKGROUND
season14Link = "http://www.realitytea.com/2009/12/17/meet-the-25-bachelorettes-of-the-bachelor-14-on-the-wings-of-love-photos/"
season14PicLinks = []
season14Names = []
testPage=requests.get(season14Link)
soup = BeautifulSoup(testPage.text, "html.parser")
rows = soup.find_all("img")
for row in rows:
    if row.get("src"):
        if "the-bachelor-14" in row.get("src"):
            season14PicLinks.append(str(row.get("src")))
            myname = str(row.get("alt"))
            myname = myname.split("14 ")[1]
            season14Names.append(myname)
len(season14Names),len(season14PicLinks)
#season14Names = ["Alexa Mcallister","Ali Fedotowsky","Ashleigh Hunt","Ashley Almore","Caitlyn McCabe","Channy Choch","Christina McCasland","Corrie","Elizabeth Kitt","Elizabeth Kreft","Ella Nolan","Emily Harkins","Gia Alliemand","Jessie Sulidis","Kathryn","Kimberly","Kirsten","Michelle Kujawa","Rozlyn Papa","Sheila","Stephanie","Tenley","Tiana","Valishia Savage","Vienna Girardi"]
#SEASON 15 BLUE BACKGROUND
season15Link = "http://www.realitytea.com/2010/12/21/photos-meet-the-25-bachelorettes-of-brad-womacks-the-bachelor-15/"
season15PicLinks = []
season15Names=[]
testPage=requests.get(season15Link)
soup = BeautifulSoup(testPage.text, "html.parser")
rows = soup.find_all("img")
for row in rows:
    if row.get("src"):
        if "the-bachelor-15" in row.get("src"):
            season15PicLinks.append(str(row.get("src")))
            season15Names.append(str(row.get("alt")))
len(season15Names),len(season15PicLinks)
#SEASON 16 BLUE BACKGROUND
season16Link = "http://www.realitytea.com/2011/12/12/photos-%E2%80%93-meet-the-25-bachelorettes-of-ben-flajnik%E2%80%99s-the-bachelor-season-16/"
season16PicLinks = []
season16Names = []
testPage=requests.get(season16Link)
soup = BeautifulSoup(testPage.text, "html.parser")
rows = soup.find_all("img")
for row in rows:
    if row.get("data-lazy-src"):
        if "The_Bachelor_16_" in row.get("data-lazy-src"):
            season16PicLinks.append(str(row.get("data-lazy-src")))
            season16Names.append(str(row.get("alt")))
len(season16Names),len(season16PicLinks)
#SEASON 17 BLUE BACKGROUND
season17Link = "http://www.realitytea.com/2012/09/28/photos-meet-the-26-bachelorettes-of-sean-lowes-the-bachelor-season-17/"
season17PicLinks = []
season17Names = []
testPage=requests.get(season17Link)
soup = BeautifulSoup(testPage.text, "html.parser")
rows = soup.find_all("img")
for row in rows:
    if row.get("data-lazy-src"):
        if "-sean-lowe-season" in row.get("data-lazy-src"):
            season17PicLinks.append(str(row.get("data-lazy-src")))
            season17Names.append(str(row.get("title")))
len(season17Names),len(season17PicLinks)
#SEASON 18 BLUE BACKGROUND
season18Links=["http://www.usmagazine.com/entertainment/pictures/the-bachelor-season-18-meet-juan-pablos-bachelorettes-2013412/343%d" % k for k in range(71,98)]
season18PicLinks = []
season18Names = []
for link in season18Links:
    testPage = requests.get(link)
    soup = BeautifulSoup(testPage.text, "html.parser")
    rows = soup.find_all("figure")
    for row in rows:
        if row.get("data-slug"):
            if "18-meet-juan-pablos" in row.get("data-slug"):
                thisimg = row.find_all("img")[0]
                thisimg = thisimg.get("src")
                season18PicLinks.append("http:" + thisimg)
                season18Names.append(str(row.get("data-title")))
len(season18Names),len(season18PicLinks)
#SEASON 19 BLUE BACKGROUND
season19Links = "http://www.accesshollywood.com/galleries/meet-bachelor-chris-soules-30-farm-girls-in-training-4692"
season19PicLinks = []
season19Names =[]
testPage=requests.get(season19Links)
soup = BeautifulSoup(testPage.text, "html.parser")
rows = soup.find_all("img")
for row in rows:
    if row.get("alt"):
        if "starring Chris Soules" in row.get("alt"):
            season19PicLinks.append(str(row.get("src")))
            b = row.get("alt")
            season19Names.append(b.split()[0])
len(season19Names),len(season19PicLinks)
#AGGREGATE PICS AND NAMES
allPicLinks = np.concatenate([season13PicLinks,season14PicLinks,season15PicLinks,season16PicLinks,season17PicLinks,season18PicLinks,season19PicLinks])
allNames = np.concatenate([season13Names,season14Names,season15Names,season16Names,season17Names,season18Names,season19Names])
len(allPicLinks),len(allNames)
#might also be helpful: http://www.buddytv.com/tvshow/page/the-bachelor-cast-1.aspx
import PIL
from PIL import Image
Uncomment this block if running for the first time.
## SAVE ALL PICS TO LOCAL MACHINE
## only really need to do this once
## make sure you're in the right folder on your local machine
# import urllib
# count = 0
# for link in allPicLinks:
# urllib.urlretrieve(link, "bachelorette_pics/bachelorette%d.jpg" % count)
# count = count+1
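Alternatively, if urllib.urlretrieve misbehaves, the same download can be done with requests (already imported above). A minimal sketch, also commented out since it only needs to run once, and assuming the bachelorette_pics/ folder already exists:
# count = 0
# for link in allPicLinks:
#     r = requests.get(link)
#     with open("bachelorette_pics/bachelorette%d.jpg" % count, "wb") as f:
#         f.write(r.content)
#     count = count + 1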
Read from the saved file on your local machine.
picsAsMatrices = []
badImages = [5,11,20,25,105]
old_allNames = allNames
print len(old_allNames)
allNames = []
viableIndices = np.concatenate([range(0,5),range(6,11),range(12,20),range(21,25),range(26,105),range(106,187)])
for count in viableIndices:
    picsAsMatrices.append(sp.misc.imread("bachelorette_pics/bachelorette%d.jpg" % count))
    allNames.append(old_allNames[count])
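Equivalently, viableIndices could have been built directly from badImages. A quick sanity check (viableIndicesAlt is a hypothetical name, used only here), assuming the 187 downloaded images indexed 0-186 as above:
viableIndicesAlt = np.array([i for i in range(0,187) if i not in badImages])
print(np.array_equal(viableIndicesAlt, viableIndices))   #True: same indices either way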
picsAsBinaryMatrices = []
#convert pics to greyscale ('LA' mode), then into numpy arrays
#(alpha minus luminance is 255 - L for these JPEGs, i.e. an inverted greyscale image, despite the "Binary" in the variable name)
for count in viableIndices:
    img0 = Image.open("bachelorette_pics/bachelorette%d.jpg" % count).convert('LA')
    img0.save("greyscale_pics/greyscale%d.png" % count)
    img1 = np.array(img0.getdata())
    img2 = (img1[:,1] - img1[:,0]).reshape(img0.size[1], img0.size[0])
    picsAsBinaryMatrices.append(img2)
Test code
#plt.imshow(picsAsBinaryMatrices[156]);
#im1 = picsAsBinaryMatrices[15]
#im2 = picsAsBinaryMatrices[16]
#mat1 = np.array(im1.getdata())#.reshape(im1.size[0], im1.size[1], 2)
#mat11= (mat1[:,1]-mat1[:,0]).reshape(im1.size[1], im1.size[0])
#plt.imshow(mat11)
#heights,widths=[],[]
#for pic in picsAsBinaryMatrices:
# heights.append(pic.size[0])
# widths.append(pic.size[1])
Test code. I tried to use PIL directly, but I couldn't get eigenvalues, eigenvectors, etc. out of it, so I wrote my own registration code (see the next section).
#cropHeight = min(heights)
#cropWidth = min(widths)
#cropParams = [0,0,cropHeight,cropWidth]
#im1 = im1.crop(cropParams)
#im2 = im2.crop(cropParams)
#Image.blend(im1,im2,alpha=0.5)
#newIm = picsAsBinaryMatrices[0]
#newIm = newIm.crop(cropParams)
#for pic in picsAsBinaryMatrices:
# tempPic = pic.crop(cropParams)
# newIm = Image.blend(newIm,tempPic,alpha=0.25)
##test
#newIm = np.add(picsAsBinaryMatrices[0],picsAsBinaryMatrices[1])
#newIm = np.add(newIm,picsAsBinaryMatrices[4])
#plt.imshow(newIm)
#REGISTRATION, using cross-correlation.
from scipy import signal
from scipy import misc
Test code
#decRate = 20
#sig0 = [[0,0,0],[0,1,0],[0,0,0]]
#sig1 = picsAsBinaryMatrices[0][85:135,125:175]
#sig2 = picsAsBinaryMatrices[1][80:130,100:150]
#decSig1 = sp.misc.imresize(sig1, decRate, interp='nearest', mode=None)
#decSig2 = sp.misc.imresize(sig2, decRate, interp='nearest', mode=None)
#xcorr = sp.signal.correlate2d(sig0,sig0, mode='full', boundary='symm')
#print xcorr
#print
#print xcorr.shape
#plt.imshow(xcorr);
#plt.imshow(decSig1)
#subset4= picsAsBinaryMatrices[testnum][50:(50+framesize),50:(50+framesize)]
#subset5= picsAsBinaryMatrices[refnum][50:(50+framesize),50:(50+framesize)]
#xcorr = sp.signal.correlate2d(picsAsBinaryMatrices[refnum],subset4, mode='same', boundary='symm')
#print (picsAsBinaryMatrices[refnum]).shape
#decimated = sp.misc.imresize(picsAsBinaryMatrices[0], 10, interp='bilinear', mode=None)
#plt.imshow(decimated)
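Before registering the real photos, here is a self-contained toy check (synthetic data, not part of the original pipeline) that correlate2d can recover a known translation:
toy = np.zeros((40,40))
toy[18:22,18:22] = 1.0                                #a small bright square near the center
shifted = np.roll(np.roll(toy,5,axis=0),3,axis=1)     #same square, shifted down 5 and right 3
xc = sp.signal.correlate2d(shifted, toy, mode='same', boundary='fill')
peak = np.unravel_index(np.argmax(xc), xc.shape)
print(peak)   #(25, 23): offset from the (20, 20) center by exactly the (5, 3) shift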
refnum = 6
testnum = 17
framesize = 150
#mirror the image across its borders (output is 3x the height and 3x the width, original in the center)
def mirror_image(im):
    im2 = np.concatenate((im[:,::-1], im), axis=1)
    im2 = np.concatenate((im2, im[:,::-1]), axis=1)
    mirrim = np.concatenate((im2, im2[::-1,:]), axis=0)
    mirrim = np.concatenate((im2[::-1,:], mirrim), axis=0)
    return mirrim
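A quick check of mirror_image on a tiny array (toy data): the output is 3x the height and 3x the width, with the original image as the center block.
tiny = np.arange(6).reshape(2,3)
print(mirror_image(tiny).shape)                            #(6, 9)
print(np.array_equal(mirror_image(tiny)[2:4,3:6], tiny))   #True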
#find index of max cross-correlation
#from http://stackoverflow.com/questions/21989513/finding-index-of-maximum-value-in-array-with-numpy
def nanargmax(a):
    idx = np.argmax(a, axis=None)
    multi_idx = np.unravel_index(idx, a.shape)
    if np.isnan(a[multi_idx]):
        nan_count = np.sum(np.isnan(a))
        # In numpy < 1.8 use idx = np.argsort(a, axis=None)[-nan_count-1]
        idx = np.argpartition(a, -nan_count-1, axis=None)[-nan_count-1]
        multi_idx = np.unravel_index(idx, a.shape)
    return multi_idx
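And a quick check that nanargmax ignores NaNs (toy data):
toy_a = np.array([[1.0, np.nan],[3.0, 2.0]])
print(nanargmax(toy_a))   #(1, 0): index of the largest non-NaN entry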
def crop_pic(im, optpx):  #image to align, optimal translation coordinates w.r.t. the reference image
    #NOTE: this first version is superseded by the crop_pic redefined a few cells below
    fsz = 0  #110; 107 worked with ref 6 and num 4
    #standard image size: (400, 267)
    px = optpx[1] - fsz
    py = optpx[0] - fsz
    x_std = im.shape[1]
    y_std = im.shape[0]
    print x_std+px, x_std*2+px
    print y_std+py, y_std*2+py
    aligned = mirror_image(im)[(y_std-py):(y_std*2-py),(x_std-px):(x_std*2-px)]  #y index first, then x
    #crops the image back to its original size about that good part, using image mirroring
    return aligned
Analysis goal: find the eigenvalues and eigenvectors of the image set, i.e., perform PCA on the aligned, cropped faces.
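As a reminder of what PCA computes (a toy sketch on random data, not the face images): after mean-centering, the principal components are the eigenvectors of the data's covariance matrix, ordered by eigenvalue.
from sklearn.decomposition import PCA
toy_data = np.random.randn(50, 4)                          #toy data: 50 samples, 4 features
centered = toy_data - toy_data.mean(axis=0)
evals, evecs = np.linalg.eigh(np.cov(centered.T))          #eigendecomposition of the 4x4 covariance
toy_pca = PCA(n_components=2).fit(toy_data)                #sklearn centers the data internally
#the first component matches the top-eigenvalue eigenvector, up to sign
print(np.allclose(np.abs(toy_pca.components_[0]), np.abs(evecs[:, np.argmax(evals)])))   #True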
def crop_to_min_size(im, minsz=(400,267)):
    oldszx, oldszy = im.shape
    newszx, newszy = minsz
    newx0, newy0 = 0, 0
    newxf, newyf = minsz
    if oldszx > newszx:
        newx0 = 0
        newxf = newx0 + newszx
    if oldszy > newszy:
        newy0 = (oldszy-newszy)/2
        newyf = newy0 + newszy
    cropped = im[newx0:newxf, newy0:newyf]
    return cropped
def crop_pic(im, optpx, fc):  #image to align, optimal translation coordinates w.r.t. the reference image, frame center
    px = optpx[1] - fc[1]
    py = optpx[0] - fc[0]
    x_std = im.shape[1]
    y_std = im.shape[0]
    aligned = mirror_image(im)[(y_std-py):(y_std*2-py),(x_std+px):(x_std*2+px)]  #y index first, then x
    #crops the image back to its original size about that good part, using image mirroring
    return aligned
def align_to_ref(imn, refn, fsz):  #image number, reference image number, frame size
    myim = crop_to_min_size(picsAsBinaryMatrices[imn])  #make smallest size so we can add them all later
    refim = picsAsBinaryMatrices[refn]
    fs = framesize  #frame size
    fc = fs/2, np.rint((myim.shape[1])/2)  #frame center
    select_face = (fc[0]-fs/2), (fc[0]+fs/2), (fc[1]-fs/2), (fc[1]+fs/2)
    subset = myim[select_face[0]:select_face[1], select_face[2]:select_face[3]]
    xcorr = sp.signal.correlate2d(refim, subset, mode='same', boundary='symm')  #takes time
    minx, miny = xcorr.shape
    optpix = nanargmax(xcorr[0:miny/2][0:minx/2])  #best way to translate the picture for alignment
    aligned_image = crop_pic(myim, optpix, fc)
    return aligned_image
Test code. You can see, by running this block, that the cross-correlation is actually doing real work; not all of the alignment in align_to_ref was done heuristically. Nice.
#aligned = []
#for i in range(146,149):
# aligned.append(align_to_ref(i,6,framesize))
#
#fig = plt.figure(figsize=(15,30))
#for a,j,i in zip(aligned,range(1,18),range(146,149)):
# s=fig.add_subplot(5,4,j)
# s.imshow(a+crop_to_min_size(picsAsBinaryMatrices[i]))
#meanpic = np.mean([crop_to_min_size(a) for a in aligned], axis = 0)
#plt.imshow(meanpic)
This took my computer about 29 minutes.
framesize = 200
import time
start = time.time()
alignedSet = []
for thisPic in range(0,len(picsAsBinaryMatrices)):
    #using image 6 as a reference because she is pretty centered in the frame
    alignedSet.append(align_to_ref(thisPic,6,framesize))
    print "Aligning image ", thisPic+1, " out of ", len(picsAsBinaryMatrices)
end = time.time()
print end - start
for thisPic, count in zip(alignedSet, viableIndices):
    np.save("aligned_pics/aligned%d.npy" % count, thisPic)  #saved as numpy arrays
Retrieve from local machine.
import sys
sys.path.append('Users/Heal/Documents/15FClasses/Data\Science/MR-DC-KH-Final-Project/aligned_pics')
tomat = np.zeros((400,267),dtype=int)
alignedSet = []
for count in viableIndices:
    tomat = np.load("aligned_pics/aligned%d.npy" % count)
    alignedSet.append(tomat)
Remove that one, too-small group shot.
alignedSetNew = alignedSet
#find the group shot (its aligned array has fewer than the standard 400 rows)
for thisPic, count in zip(alignedSetNew, range(0,len(alignedSetNew))):
    if thisPic.shape[0] < 400:
        #delete that group shot
        alignedSetNew = np.delete(alignedSetNew, count)
        viableIndices = np.delete(viableIndices, count)
        allNames = np.delete(allNames, count)
It turns out that the images are too high-res and large: at 400x267 there are 106,800 pixels per image, which is far too many dimensions given the 182 samples we have. Fortunately, we have registered the images, so we know where we can safely crop the photos. Below we crop to rows and columns [75:200], a 125x125 patch around the face.
from scipy.signal import decimate
smallSet = []
downsampled = []
for a in alignedSetNew:
    newSmall = a[75:200,75:200]
    smallSet.append(newSmall)
    downsampled.append(decimate(newSmall,10))
print downsampled[0].shape[0]*downsampled[0].shape[1]
print smallSet[0].shape[0]*smallSet[0].shape[1]
We didn't end up downsampling after all, but we could have!
This is the average of all the faces. It's not too descriptive, since we have so many images and our alignment wasn't perfect.
alignedMean = np.mean(smallSet,axis=0); #zoom in on the face
plt.imshow(alignedMean);
alignedMean = np.mean(alignedSet,axis=0); #overall image
plt.imshow(alignedMean);
"Hi, I'm a terrifying demon face" -Average of the matrices
alignedSetNew=smallSet
dims = alignedSetNew[0].shape
vecLen = dims[0]*dims[1]
matsAsVects=[]
for im in alignedSetNew:
    matsAsVects.append(np.reshape(im,vecLen))
#now matsAsVects is a list of these 15625-vectors (125x125 pixels), one per image
#X = np.concatenate(np.transpose([matsAsVects]),axis=1)
Y = np.transpose(np.concatenate(np.transpose([smallSet])))
X = Y[0,:,:]
#Bootstrappin': resample existing rows so that Xboot ends up with vecLen rows (as many rows as pixels)
numSamps = X.shape[0]
indboot = np.random.choice(range(0,numSamps), vecLen-numSamps)
Xboot = []
count = 0
for i in indboot:
    Xboot.append(X[i])
    count = count + 1
Xboot = np.concatenate([X,Xboot])
Xboot.shape
#from Lab 5
# miniblocks =[]
# testsize =10000
# for x in Xboot[0:(testsize^2)]:
# miniblocks.append(x[0:testsize][0:testsize])
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
First principal component of a winner? That would give us a side-by-side comparison.
WHAT THE WHAT, this thing took an hour and a half!
import time
start = time.time()
Xvec = pca.fit_transform(Xboot)
end = time.time()
print "That took " ,end - start, " seconds."
np.save("pca_copy",pca) #ok let's save this so we never have to run it again
pca.components_[0]  #the first principal component (note: the mean face itself lives in pca.mean_, not here)
#from Lab 5
testsize = 15625  #number of pixels per cropped image (125x125)
def normit(a):
    a = (a - a.min())/(a.max() - a.min())
    a = a*256
    return np.round(a)
def getNC(pc, j):
    size = testsize*testsize
    r = pc.components_[j]  #[0:size:3]
    r = normit(r)
    return r
def display_component(pc, j):
    r = getNC(pc,j)
    r = np.array(r)
    side = int(np.sqrt(testsize))  #cast to int so reshape gets integer dimensions
    myArray = np.transpose(np.reshape(r,(side,side)))
    plt.imshow(myArray)
    plt.xticks([])
    plt.yticks([])
def display_pc(pc, j, ax):
    r = getNC(pc,j)
    r = np.array(r)
    side = int(np.sqrt(testsize))
    myArray = np.transpose(np.reshape(r,(side,side)))
    ax.imshow(myArray)
#display_component(pca,2)
fig = plt.figure(figsize=(20,10))
count = 0
for r in range(0,10):
    s = fig.add_subplot(2,5,r+1)
    s.set_title("PC %d" % r)
    s.set_xlabel("Frac. Var. Expl.: %f" % pca.explained_variance_ratio_[r])
    s.set_xticks([])
    s.set_yticks([])
    display_pc(pca,r,s)
Approximately 70% of the variance can be explained by these top 10 principal components.
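A quick check of that number (the cumulative explained variance of the 10 retained components):
print(pca.explained_variance_ratio_.sum())   #roughly 0.70, per the claim above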
#sanity check: number of projected samples vs. number of names
len(Xvec[0:182,0]),len(allNames)
##season winners
win_ind = [46,52,81,106,129,181]
##first runners up
run_ind = [10,43,50,92,119,140,157]
##eliminated round 1
season13elim = [0,1,2,4,5,13,16,18,19,21]
season14elim = [22,26,27,31,33,37,38,41,42,44]
season15elim = [48,51,53,55,58,59,62,69,70,71]
season16elim = [75,77,82,86,93,99]
season17elim = [103,104,111,113,115,116]
season18elim = [151,149,146,142,135,134,133,130,126]
season19elim = [153,158,160,167,169,172,173,175]
elim_ind = np.concatenate([season13elim,season14elim,season15elim,season16elim,season17elim,season18elim,season19elim])
##AFTER the PCA process, we add success labels
winners = np.zeros((len(allNames),))  #0: all others
winners[win_ind] = 1     #1: season winners
winners[run_ind] = 0.5   #0.5: runners up
winners[elim_ind] = -1   #-1: eliminated in the first week
df = pd.DataFrame({"name":allNames,"winner":winners})
for i in range(pca.explained_variance_ratio_.shape[0]):
    df["pc%i" % (i+1)] = Xvec[0:182,i]
df.head()
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
c0=sns.color_palette()[0]
c1=sns.color_palette()[1]
c2=sns.color_palette()[2]
c3=sns.color_palette()[3]
c4=sns.color_palette()[4]
colors = [c1,c0,c4,c2]
for label, color, transp in zip(df['winner'].unique(), colors, [0.9,0.9,0.9,0.9]):
    mask = df['winner']==label
    plt.scatter(df[mask]['pc1'], df[mask]['pc2'], c=color, label=label, s=400, alpha=transp);
plt.legend(["Eliminated Round 1","Eliminated Round 2+","Runners Up","Season Winners"]);
plt.title("Relationship Between Principal Components and Success in 'The Bachelor'")
plt.xlabel("First Principal Component");
plt.ylabel("Second Principal Component");
Whoa! Linearly separable RunnersUp/Winners!
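To make that claim concrete, here is a small check (not part of the original analysis) that a linear SVM fit on PC1/PC2 separates winners from runners up; names like sep_mask and clf are local to this cell:
from sklearn.svm import LinearSVC
sep_mask = (df['winner']==1) | (df['winner']==0.5)   #winners and runners up only
X_sep = df[sep_mask][['pc1','pc2']].values
y_sep = (df[sep_mask]['winner']==1).values
clf = LinearSVC(C=1e5).fit(X_sep, y_sep)             #large C approximates a hard margin
print(clf.score(X_sep, y_sep))   #1.0 means the training points are perfectly separated by a line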
##plot our winners/losers
fig1 = plt.figure(figsize=(40,30))
fig1.suptitle("Season Winners",fontsize=40)
count = 0
for a in win_ind:
    count = count+1
    s = fig1.add_subplot(10,10,count)
    s.imshow(alignedSetNew[int(a)])
    s.set_xticks([])
    s.set_yticks([])
fig2 = plt.figure(figsize=(40,30))
fig2.suptitle("First Runners Up",fontsize=40)
count = 0
for a in run_ind:
    count = count+1
    s = fig2.add_subplot(10,10,count)
    s.imshow(alignedSetNew[int(a)])
    s.set_xticks([])
    s.set_yticks([])
fig3 = plt.figure(figsize=(20,20))
fig3.suptitle("Eliminated Round 1",fontsize=40)
count = 0
for a in elim_ind:
    count = count+1
    s = fig3.add_subplot(10,10,count)
    s.imshow(alignedSetNew[int(a)])
    s.set_xticks([])
    s.set_yticks([])
As you can see, the alignment wasn't perfect; given more time we could do much better, in particular by exploring rotations and fixed-ratio scaling in addition to translation. Still, compared with the unaligned images, the alignment helped immensely.
Now we put everything in a nice dictionary so that we can use it with our other analyses.
##just checking stuff
#print [df['name'][j] for j in win_ind]
#[(i, allNames[i]) for i in range(0,len(allNames))]
print df.head()
seasons=[13,14,15,16,17,18,19]
# allDict = pd.DataFrame({"Season":seasons})
# print allDict.head()
##these indices correspond to the list above, AFTER viableIndices has been applied.
s13=range(0,22)
s14=range(22,47)
s15=range(47,75)
s16=range(75,101)
s17=range(101,125)
s18=range(125,152)
s19=range(152,181)
s = [s13,s14,s15,s16,s17,s18,s19]
seasonDicts=[]
for season, inds in zip(seasons, s):
    seasonNames = [allNames[i] for i in inds]
    seasonPC01 = [(df['pc1'][i], df['pc2'][i]) for i in inds]
    seasonDicts.append(dict(zip(seasonNames, seasonPC01)))
allDict = dict(zip(seasons,seasonDicts))
import csv
with open('theBachelorPCA_dict.csv','wb') as f:
    w = csv.writer(f)
    w.writerows(allDict.items())
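To read the saved dictionary back in another notebook, one option is the following (a sketch; assumes the CSV written above, with each per-season dict stored as its string representation):
import csv
import ast
readDict = {}
with open('theBachelorPCA_dict.csv','rb') as f:
    for season, pcdict in csv.reader(f):
        readDict[int(season)] = ast.literal_eval(pcdict)   #parse the dict string back into {name: (pc1, pc2)}
print(len(readDict))   #7 seasons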