# main.py (forked from akhilketkar/cs209-project)
__author__ = 'Akhil'
import numpy as np
import matplotlib.pyplot as plt
import json
import urllib2
import bs4
import pandas as pd
import lxml.html as lh
from datetime import datetime as dt
import time
import sklearn.linear_model
import statsmodels.api as sm
from patsy import dmatrices
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
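# NOTE: this script targets Python 2 (urllib2, print statements). The NLTK
# movie_reviews corpus must be downloaded once before running, e.g.:
#   import nltk; nltk.download('movie_reviews')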
apiKey = "u8j7q6zesvbf2mb44abmhfdp"
apiSuffix = "?apikey=" + apiKey
pageLimitSuffix = "&page_limit="
querySuffix = "&q="
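
# The suffixes above get concatenated into request URLs, e.g.
#   http://api.rottentomatoes.com/.../movies.json?apikey=<key>&page_limit=50
# A small helper along these lines (a sketch, not part of the original flow)
# would keep that assembly in one place:
def buildListUrl(baseUrl, pageLimit=None, query=None):
    url = baseUrl + apiSuffix
    if pageLimit is not None:
        url += pageLimitSuffix + str(pageLimit)
    if query is not None:
        url += querySuffix + query
    return url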
def getRTData():
    movieListUrl = "http://api.rottentomatoes.com/api/public/v1.0/lists/movies.json"
    dvdListUrl = "http://api.rottentomatoes.com/api/public/v1.0/lists/dvds.json"
    movieDict = {}

    # get the list of movie lists (top_rentals, current_releases, upcoming, etc.)
    response = urllib2.urlopen(movieListUrl + apiSuffix)
    jsonText = response.read()
    data = json.loads(jsonText)
    links = data["links"]

    for linkTitle, linkUrl in links.items():
        response = urllib2.urlopen(linkUrl + apiSuffix + pageLimitSuffix + str(50))
        movies = json.loads(response.read())["movies"]
        # loop over movies, recording the reviews URL for each title
        for movie in movies:
            title = movie["title"]
            revUrl = movie["links"]["reviews"]
            movieDict[title] = revUrl
            # reviews = json.loads(urllib2.urlopen(revUrl+apiSuffix+pageLimitSuffix+str(50)).read())["reviews"]
            # loop over the reviews for each movie
            # for r in reviews:
            #     print (title, r["critic"], r["publication"], r["freshness"])

    # get the list of dvd lists (top_rentals, current_releases, upcoming, etc.)
    response = urllib2.urlopen(dvdListUrl + apiSuffix)
    jsonText = response.read()
    data = json.loads(jsonText)
    links = data["links"]

    for linkTitle, linkUrl in links.items():
        response = urllib2.urlopen(linkUrl + apiSuffix + pageLimitSuffix + str(50))
        movies = json.loads(response.read())["movies"]
        # loop over movies
        for movie in movies:
            title = movie["title"]
            revUrl = movie["links"]["reviews"]
            movieDict[title] = revUrl

    print movieDict.keys()
    return movieDict
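
# A sketch of the per-movie review fetch hinted at by the commented-out code
# inside getRTData, pulled into its own function with basic error handling
# (the helper name is ours; assumes the same JSON shape as the calls above):
def getReviewsForMovie(revUrl, pageLimit=50):
    try:
        response = urllib2.urlopen(revUrl + apiSuffix + pageLimitSuffix + str(pageLimit))
        return json.loads(response.read())["reviews"]
    except urllib2.HTTPError:
        # e.g. rate-limited or movie page gone; skip rather than crash
        return []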

def word_feats(words):
    return dict([(word, True) for word in words])
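
# e.g. word_feats(["great", "movie"]) -> {"great": True, "movie": True};
# NLTK's NaiveBayesClassifier consumes featuresets shaped like this dict.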

def trainClassifer():
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # hold out the last quarter of each class for testing
    negcutoff = len(negfeats) * 3 / 4
    poscutoff = len(posfeats) * 3 / 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

    classifier = NaiveBayesClassifier.train(trainfeats)
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    classifier.show_most_informative_features()
    return classifier
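
# Retraining on every run re-reads the whole corpus; one option (a sketch,
# not in the original) is to cache the trained classifier with pickle:
def loadOrTrainClassifier(path="classifier.pkl"):
    import os
    import pickle
    if os.path.exists(path):
        # reuse a previously trained classifier
        with open(path, "rb") as f:
            return pickle.load(f)
    classifier = trainClassifer()
    with open(path, "wb") as f:
        pickle.dump(classifier, f)
    return classifier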

def regressionAnalysis():
    # load the 2013 RT-score + budget data; keep rows with real grosses/scores
    data3 = pd.read_csv("movieRTAndBudgetData.csv")
    data3 = data3.dropna()
    data3 = data3[(data3["Worldwide"] > 0) & (data3["CriticsScore"] > 0)]

    data3["PR"] = data3["Worldwide"] / data3["Budget"]  # gross-to-budget ratio
    data3["lW"] = np.log(data3["Worldwide"])            # log worldwide gross
    data3["lB"] = np.log(data3["Budget"])               # log budget

    # Use sklearn
    # model = sklearn.linear_model.LinearRegression()
    # model.fit(data3[["Budget","AudScore","CriticsScore"]].as_matrix(), data3[["Worldwide"]].as_matrix())
    # print model.intercept_, model.coef_

    # OLS of log gross on log budget and audience score, optionally restricted
    # to budgets above minBudgetIndie
    minBudgetIndie = 0
    y, X = dmatrices("lW ~ lB + AudScore", data=data3[data3["Budget"] > minBudgetIndie], return_type="dataframe")
    model = sm.OLS(y, X)
    res = model.fit()
    print res.summary()
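
# A sketch of using the fit for prediction (assumes regressionAnalysis is
# modified to return res; exog column order from patsy is
# [Intercept, lB, AudScore]):
# res = regressionAnalysis()
# predicted log-gross for a $10M budget and an audience score of 70:
# res.predict(np.array([[1.0, np.log(1e7), 70.0]]))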

if __name__ == "__main__":
    classifier = trainClassifer()

    # classify a few single words; note that word_feats expects an iterable
    # of words, so each word is wrapped in a list (passing the bare string
    # would featurize its individual characters instead)
    words = ["great", "good", "bad", "john", "sarcastic"]
    probs = classifier.prob_classify_many([word_feats([w]) for w in words])
    for p, w in zip(probs, words):
        print w, p.prob("pos")
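
    # getRTData() and regressionAnalysis() are defined above but never called
    # here; uncomment to run them as well (getRTData needs a valid RT API key):
    # getRTData()
    # regressionAnalysis()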