spider.rb
# Remove all your Facebook activity with this simple web crawler in Ruby.
#
# Crawler based on https://rossta.net/blog/how-to-write-a-simple-web-crawler-in-ruby-revisited.html
#
# Requirements:
#   Ruby 2.0+
#
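# Usage (a suggested invocation, assuming the mechanize and pry gems are
# installed, e.g. with `gem install mechanize pry`):
#   1. Fill in your email, password and profile id (fid) at the bottom of this file.
#   2. Run: ruby spider.rb
#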
require "mechanize"
require "pry"
class Spider
  REQUEST_INTERVAL = 0.2
  MAX_URLS = 10000

  attr_reader :handlers

  def initialize(processor, options = {})
    @processor = processor
    @options = options
    @results = []
    @urls = []
    @handlers = {}
    @interval = options.fetch(:interval, REQUEST_INTERVAL)
    @max_urls = options.fetch(:max_urls, MAX_URLS)

    enqueue(@processor.root, @processor.handler)
  end

  def enqueue(url, method, data = {})
    return if @handlers[url]
    @urls << url
    @handlers[url] ||= { method: method, data: data }
  end

  def record(data = {})
    @results << data
  end

  def results
    return enum_for(:results) unless block_given?

    i = @results.length
    enqueued_urls.each do |url, handler|
      begin
        log "Calling", url.inspect
        @processor.send(handler[:method], agent.get(url), handler[:data])
        if block_given? && @results.length > i
          yield @results.last
          i += 1
        end
      rescue => ex
        log "Error", "#{url.inspect}, #{ex}"
      end
      sleep @interval if @interval > 0
    end
  end

  private

  def enqueued_urls
    Enumerator.new do |y|
      index = 0
      while index < @urls.count && index <= @max_urls
        url = @urls[index]
        index += 1
        next unless url
        y.yield url, @handlers[url]
      end
    end
  end

  def log(label, info)
    warn "%-10s: %s" % [label, info]
  end

  def agent
    @agent ||= begin
      agent = Mechanize.new
      if File.exist?("cookies.yaml")
        agent.cookie_jar.load("cookies.yaml")
      else
        log "Logging in", "..."
        login_page = agent.get("https://mbasic.facebook.com/login.php")
        form = login_page.forms.first
        form.email = @options[:email]
        form.pass = @options[:pass]
        agent.submit(form)
        agent.cookie_jar.save("cookies.yaml", session: true)
      end
      agent
    end
  end
end
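
# ProgrammableWeb is the processor for the Facebook activity log: it follows
# the "timeend" pagination links and enqueues every "removecontent" and
# "delete" link it finds; fetching those links is what removes the
# corresponding activity item.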
class ProgrammableWeb
  attr_reader :root, :handler

  def initialize(root, **options)
    @root = root
    @handler = :process_index
    @options = options
  end

  def process_index(page, data = {})
    page.links_with(href: /\/allactivity\?timeend=/).each do |link|
      spider.enqueue(link.href, :process_index)
    end
    page.links_with(href: /\/allactivity\/removecontent/).each do |link|
      spider.enqueue(link.href, :delete)
    end
    page.links_with(href: /\/allactivity\/delete/).each do |link|
      spider.enqueue(link.href, :delete)
    end
  end

  def delete(page, data = {})
    spider.record page.uri
  end

  def results(&block)
    spider.results(&block)
  end

  private

  def spider
    @spider ||= Spider.new(self, @options)
  end
end
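
# Entry point: replace the placeholders below with your own credentials and
# numeric profile id before running.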
if __FILE__ == $0
  email = "***"
  pass = "***"
  fid = "***"

  spider = ProgrammableWeb.new("https://mbasic.facebook.com/#{fid}/allactivity", email: email, pass: pass)

  # spider.results.lazy.take(5).each_with_index do |result, i|
  spider.results.each_with_index do |result, i|
    warn "%-2s: %s" % [i, result.inspect]
  end
end