-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsimilarity2.py
71 lines (54 loc) · 1.58 KB
/
similarity2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from imdb import IMDb
import pandas as pd
import numpy as np
import csv
import pickle
import string
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# WordNet lemmatizer instance; not referenced in the visible part of this script.
lem = WordNetLemmatizer()
# IMDb client used for the chart, search and movie requests further down.
ia = IMDb()
# Tokens collected under the name `bad_words`: 21 single punctuation
# characters followed by the empty string, a double quote, a quoted space
# and a lone space. Presumably meant for filtering tokens during text
# cleaning; nothing in the visible part of this script reads it.
bad_words = list(',.()+-_/|[]*&^%$#@!:;') + ['', '"', '" "', ' ']
# Set of the words NLTK lists for "english" in its stopwords corpus.
stop_words = set()
stop_words.update(stopwords.words("english"))
# String forms of the integers 0..9999.
numbers = [str(n) for n in range(10000)]

# One-decimal strings "0.0" .. "9.9".
# Built from integer tenths rather than np.arange(0, 10, 0.1): the float
# step accumulates rounding error, so str() produced entries such as
# '0.30000000000000004' instead of '0.3'. Dividing an exact integer by 10
# is correctly rounded and always prints with a single decimal digit.
decimal = [str(tenths / 10) for tenths in range(100)]
# Fetch the IMDb Top 250 chart through the shared client.
top = ia.get_top250_movies()
##movie = ia.get_movie('0133093')
##ia.update(movie,info=['critic reviews','photo sites','review'])
##print(movie.infoset2keys)
##if 'arithmetic mean' in movie:
## print(movie['reviews'][0]['content'])

movies = {}  # str(movie) -> list of str(cast entry) for that movie
not1 = []    # movies that were skipped (see the two skip paths below)
inc = 0      # number of movies stored in `movies` so far

# Slicing keeps the original cap of 250 entries but, unlike indexing
# 0..249, does not raise IndexError when the chart comes back shorter.
for movie in top[:250]:
    # Resolve the chart entry again by searching for its string form and
    # taking the first hit.
    # NOTE(review): the chart entry presumably already carries a movieID
    # that could be used directly, saving a request and avoiding a wrong
    # first hit -- confirm before changing, since it may alter the keys.
    matches = ia.search_movie(str(movie))
    if not matches:
        # Previously an empty result crashed the whole run on a[0].
        not1.append(movie)
        print("no cast")
        continue

    mov = ia.get_movie(str(matches[0].movieID))
    ia.update(mov, info=['plot'])

    try:
        # Only movies that have at least one synopsis entry are kept, as
        # before; the synopsis text itself is not used here.
        # NOTE(review): presumably this keeps the keys aligned with a
        # synopsis-based pickle built elsewhere -- confirm.
        _ = mov['synopsis'][0]
        cast = [str(person) for person in mov['cast']]
    except (KeyError, IndexError):
        # Missing 'synopsis'/'cast' key or an empty synopsis list. Anything
        # else (Ctrl-C, missing NLTK data, real bugs) now propagates instead
        # of being silently swallowed by a bare except.
        not1.append(mov)
        print("no cast")
        continue

    movies[str(mov)] = cast
    inc += 1
    print(inc)
##with open("movie_words2.pickle",'wb') as dic:
## pickle.dump(movies,dic,protocol=pickle.HIGHEST_PROTOCOL)
# Serialize the `movies` dict to disk with the highest pickle protocol
# available, then report completion on stdout.
cast_pickle_path = "movies_cast.pickle"
with open(cast_pickle_path, mode='wb') as out_file:
    pickle.dump(movies, out_file, pickle.HIGHEST_PROTOCOL)
print("done")