-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathamazon_movie_sorter.py
42 lines (34 loc) · 1.27 KB
/
amazon_movie_sorter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# Reads movies.txt.gz
# Sorts the reviews by date (review time, ascending)
# Stores the result as a pickle containing the tuple (texts, keys):
# - list of texts (concatenated fields summary and text)
# - list of keys: helpfulness, score, time, number
import os
import pickle
from datetime import datetime
import yaml
from word2vec.amazon_reviews_reader import AmazonReviewsReader
import time
#config = yaml.safe_load(open("config.yaml", 'r'))

# Script: read the gzipped Amazon movie reviews, order them by review time,
# and pickle the result as the tuple (text_list, key_list).
amazon_gz_file = "data/movies/movies.txt.gz"
amazon_raw_file = "data/movies/embeddings/amazon_raw.pickle"
max_docs = -1  # -1 for all

text_list = []  # one string per review: summary + " " + text
key_list = []   # one [helpfulness, score, time, number] record per review
ident = []      # key record of the review currently being processed

timeBegin = time.time()
print("Begin", time.asctime())

print("Reading", amazon_gz_file)
for item in AmazonReviewsReader(amazon_gz_file, "fields", max_docs=max_docs):
    ident = [
        item['helpfulness'],
        # The score is parsed via float first, so "5.0" becomes 5.
        int(float(item['score'])),
        # NOTE(review): fromtimestamp() without tz yields a naive datetime in
        # the machine's local time zone, so the stored values depend on where
        # the script runs - confirm this is intended.
        datetime.fromtimestamp(int(item['time'])),
        item['number'],
    ]
    key_list.append(ident)
    # HTML line breaks in the review are flattened to plain spaces.
    text_list.append((item['summary'] + " " + item['text']).replace('<br />', ' '))

# Sort texts and keys together by review time (index 2 of each key record).
# sorted() is stable, so reviews with the same timestamp keep their read order.
# Rebuilding the two lists from the sorted pairs (instead of unpacking
# zip(*pairs)) also works when no reviews were read at all.
sorted_pairs = sorted(zip(text_list, key_list), key=lambda pair: pair[1][2])
text_list = [text for text, _ in sorted_pairs]
key_list = [key for _, key in sorted_pairs]

print("Writing", amazon_raw_file)
# Make sure the target directory exists so the run does not fail at the very
# end, after all the reading and sorting work has been done.
output_dir = os.path.dirname(amazon_raw_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(amazon_raw_file, 'wb') as handle:
    pickle.dump((text_list, key_list), handle)

print("Runtime (secs)", time.time()-timeBegin)