-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtwitter_scraper.py
86 lines (77 loc) · 3.8 KB
/
twitter_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# requires python 3.8+
# pip install git+https://github.com/JustAnotherArchivist/snscrape
# no credentials needed
# expect 200-400k tweets/hour (lower bound from EC2)
# use csv.reader(dialect='excel-tab') to parse (due to quotations and newlines)
import argparse
from collections import Counter
import csv
import os
import re
import time
import snscrape.modules.twitter as sntwitter
parser = argparse.ArgumentParser()
parser.add_argument('--lang', default='he', help='See https://developer.twitter.com/en/docs/twitter-for-websites/supported-languages')
parser.add_argument('--limit', type=int, help='Number of tweets to fetch')
parser.add_argument('--total', type=int, help='Number of tweets of tweets')
parser.add_argument('--delay_min', type=int, default=10, help='Minutes to wait between retries if results stop early')
parser.add_argument('--verbose', type=int, default=500000, help='Number of lines to accumulate between progress printouts')
args = parser.parse_args()
print(args)
headers = ['id', 'datetime', 'username', 'reply_to', 'quote_of', 'replies', 'retweets', 'quotes', 'likes', 'content']
dialect = csv.excel_tab()
dialect.strict = True
latest = None
found = 0
years = Counter()
maxid_arg = ''
with open('tweets_%s.tsv' % args.lang, 'a+', encoding='utf8', newline='') as f:
f.seek(0)
for found, row in enumerate(csv.reader(f, dialect=dialect), start=1):
if found > 1:
if found == 2:
latest = row[1]
years[row[1][:4]] += 1
writer = csv.writer(f, dialect=dialect)
if found:
found -= 1 # account for headers
assert len(row) == len(headers), (len(row), len(headers))
maxid_arg = ' max_id:%d' % (int(row[0])-1)
date = row[1]
print('found %d tweets. earliest: %s. latest: %s' % (found, date, latest))
elif not found:
writer.writerow(headers)
start = time.time()
i = 0
skip = 0
first = True
while (not args.limit or i < args.limit) and (not args.total or found + i < args.limit):
if not first:
print('Error getting tweets. Will wait %d min. before continuing' % args.delay_min)
time.sleep(args.delay_min * 60)
print('Continuing...')
first = False
try:
for tweet in sntwitter.TwitterSearchScraper('lang:' + args.lang + maxid_arg).get_items():
if tweet.content.endswith((' has been withheld in response to a report from the copyright holder. Learn more.', '\'s account is temporarily unavailable because it violates the Twitter Media Policy. Learn more.')) or args.lang == 'he' and not re.search('[א-ת]', tweet.content):
skip += 1
continue
date = str(tweet.date).split('+')[0]
if not latest:
latest = date
years[row[1][:4]] += 1
writer.writerow([tweet.id, date, tweet.user.username, tweet.inReplyToTweetId, tweet.quotedTweet.id if tweet.quotedTweet is not None else None, tweet.replyCount, tweet.retweetCount, tweet.quoteCount, tweet.likeCount, tweet.content.replace('\r\n', '\n').replace('\r', '\n').replace('\x00', '')])
f.flush()
os.fsync(f.fileno())
maxid_arg = ' max_id:%d' % (int(tweet.id) - 1)
i += 1
if i == args.limit or found + i == args.total:
break
if args.verbose and not i % args.verbose:
print('got %d tweets in %.2f hours (skipped: %d). earliest tweet: %s' % (i, (time.time()-start) / 3600, skip, date))
break
except Exception as e:
print(e)
print('got %d tweets in %.2f hours (skipped: %d)' % (i, (time.time()-start) / 3600, skip))
print('total tweets: %d. earliest: %s latest: %s.' % (found + i, date, latest))
print(sorted(years.items()))