-
Notifications
You must be signed in to change notification settings - Fork 57
/
Copy pathwebmention.py
201 lines (175 loc) · 7.75 KB
/
webmention.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
"""Base handler class and common utilities for handling webmentions.
Used in publish.py and blog_webmention.py.
Webmention spec: http://webmention.org/
"""
import logging
from flask import jsonify, request
from flask.views import View
from google.cloud import error_reporting
from oauth_dropins.webutil.util import json_dumps, json_loads
from oauth_dropins.webutil import flask_util
import werkzeug.exceptions
from flask_app import app
import util
logger = logging.getLogger(__name__)
@app.route('/publish/<any(bluesky,flickr,github,mastodon):silo>',
           methods=['GET', 'HEAD'])
@flask_util.headers({'Cache-Control': 'public, max-age=86400'})
def webmention_get_or_head(silo):
    """Serves webmention endpoint discovery for GET and HEAD requests.

    Advertises the ``/publish/webmention`` endpoint twice: as a
    ``<link rel="webmention">`` element in a minimal HTML page, and as an HTTP
    ``Link`` header, so that clients can discover it without parsing the body.

    Args:
      silo: str, silo name matched by the route (one of bluesky, flickr,
        github, mastodon). Not used in the response.

    Returns:
      (HTML body str, headers dict) tuple
    """
    endpoint_path = '/publish/webmention'

    page = f"""\
<!DOCTYPE html>
<html><head>
<link rel="webmention" href="{util.host_url(endpoint_path)}">
</head>
<body>Nothing here! <a href="/about">Try the docs instead.</a></body>
</html>"""

    link_value = f'<{util.host_url(endpoint_path)}>; rel="webmention"'
    return page, {'Link': link_value}
class Webmention(View):
    """Webmention base view.

    Provides shared helpers for subclasses: fetching a page and parsing its
    microformats2 (:meth:`fetch_mf2`), turning a failure into an error response
    (:meth:`error`), and reporting it to error reporting (:meth:`report_error`).

    Attributes:
      * source (models.Source): for this webmention
      * entity (models.Publish or models.Webmention) entity for this webmention
    """
    source = None
    entity = None

    # Substrings of error messages for known, expected failures. An error whose
    # message contains any of these is not sent to error reporting.
    _KNOWN_FAILURES = (
        'Deadline exceeded while waiting for HTTP response',
        'urlfetch.Fetch() took too long',
        # WordPress Jetpack bugs
        # https://github.com/snarfed/bridgy/issues/161
        '"resp": "invalid_input"',
        # https://github.com/snarfed/bridgy/issues/750
        '"error": "jetpack_verification_failed"',
        # https://console.cloud.google.com/errors/CMjIg52NkMLQYA?project=brid-gy
        'The Jetpack site encountered an error and could not process the API request',
        # https://console.cloud.google.com/errors/CL6xvLS7k6qE3QE?project=brid-gy
        'The Jetpack site is inaccessible or returned an error',
        # Blogger known bug
        # https://github.com/snarfed/bridgy/issues/175
        'bX-2i87au',
        # Tumblr: transient Disqus error looking up thread
        # https://github.com/snarfed/bridgy/issues/177
        "Invalid argument, 'thread': Unable to find thread",
        # expected for partially set up tumblr accounts
        "we haven't found your Disqus account",
        # Twitter 5MB image file size limit
        '"message":"Image file size must be',
        # Twitter media file number limits
        'Tweet with media must have exactly 1 gif or video',
        # Facebook image type/size req'ts
        'Missing or invalid image file',
        "Your photos couldn't be uploaded. Photos should be less than 4 MB",
        # Twitter duplicate publish attempts
        'Status is a duplicate.',
        'You have already favorited this status.',
        'You have already retweeted this',
        # Facebook duplicate publish attempts
        'This status update is identical to the last one you posted.',
        # WordPress duplicate comment
        # "error": "Error: 409 HTTP Error 409: Conflict; {\n \"error\": \"comment_duplicate\",\n \"message\": \"Duplicate comment detected; it looks as though you’ve already said that!\"\n}\n"
        'comment_duplicate',
    )

    def fetch_mf2(self, url, id=None, require_mf2=True, raise_errors=False):
        """Fetches a URL and extracts its mf2 data.

        Side effects: sets ``entity.html`` once the page has been fetched, calls
        :meth:`error` on errors.

        If the page yields no mf2 items, falls back to a Tumblr-specific
        heuristic: the first ``class="post"`` element following ``id="content"``
        is rewritten to mf2 class names (``h-entry``, ``e-content``,
        ``u-photo``) and re-parsed.

        Args:
          url: str
          id: str, optional id of specific element to extract and parse. defaults
            to the whole page.
          require_mf2: boolean, whether to return error if no mf2 are found
          raise_errors: boolean, whether to let error exceptions propagate up or
            handle them

        Returns:
          (requests.Response, mf2 data dict) tuple

        Raises:
          werkzeug.exceptions.HTTPException: re-raised unchanged if raised
            while fetching.
          Exception: any other fetch failure, only if ``raise_errors`` is True.
        """
        try:
            resp = util.requests_get(url)
            resp.raise_for_status()
        except werkzeug.exceptions.HTTPException:
            # raised by us, probably via self.error()
            raise
        except Exception as e:
            # Exception rather than BaseException: KeyboardInterrupt, SystemExit
            # and GeneratorExit must propagate instead of being reported to the
            # client as a failed fetch.
            if raise_errors:
                raise
            util.interpret_http_exception(e)  # log exception
            self.error(f'Could not fetch source URL {url}')
            # NOTE: everything below relies on self.error() not returning
            # (presumably it raises an HTTPException via flask_util.error);
            # otherwise resp would be unbound here.

        if self.entity:
            self.entity.html = resp.text

        # parse microformats
        soup = util.parse_html(resp)
        mf2 = util.parse_mf2(soup, url=resp.url, id=id)
        if id and not mf2:
            self.error(f'Got fragment {id} but no element found with that id.')

        # special case tumblr's markup: div#content > div.post > div.copy
        # convert to mf2 and re-parse
        if not mf2.get('items'):
            contents = soup.find_all(id='content')
            if contents:
                post = contents[0].find_next(class_='post')
                if post:
                    post['class'] = 'h-entry'
                    copy = post.find_next(class_='copy')
                    if copy:
                        copy['class'] = 'e-content'
                    photo = post.find_next(class_='photo-wrapper')
                    if photo:
                        img = photo.find_next('img')
                        if img:
                            img['class'] = 'u-photo'
                    # TODO: i should be able to pass post or contents[0] to mf2py
                    # instead here, but it returns no items. mf2py bug?
                    doc = str(post)
                    mf2 = util.parse_mf2(doc, resp.url)

        logger.debug(f'Parsed microformats2: {json_dumps(mf2, indent=2)}')
        items = mf2.get('items', [])
        if require_mf2 and (not items or not items[0]):
            self.error('No microformats2 data found in ' + resp.url, data=mf2, html=f"""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="{resp.url}">{util.pretty_link(resp.url)}</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""")

        return resp, mf2

    def error(self, error, html=None, status=400, data=None, log_exception=False,
              report=False, extra_json=None, http_response=True):
        """Handle an error. May be overridden by subclasses.

        If :attr:`entity` is set and its status is ``'new'``, marks it
        ``'failed'`` and calls its ``put()``. Then builds a JSON body with the
        error message under ``error``, plus ``parsed`` and any ``extra_json``
        keys when given.

        Args:
          error (str): human-readable error message
          html (str): HTML human-readable error message. Not used by this base
            implementation.
          status (int): HTTP response status code
          data (dict): mf2 data parsed from source page
          log_exception (bool): whether to include a stack trace in the log msg
          report (bool): whether to report to StackDriver Error Reporting.
            404s are never reported.
          extra_json (dict): to be merged into the JSON response body. Must not
            contain the keys ``error`` or ``parsed``.
          http_response (bool): whether to return an error HTTP response
        """
        if self.entity and self.entity.status == 'new':
            self.entity.status = 'failed'
            self.entity.put()

        resp = {'error': error}
        if data:
            resp['parsed'] = data
        if extra_json:
            # extra_json may only add keys, never replace the standard ones
            assert 'error' not in extra_json
            assert 'parsed' not in extra_json
            resp.update(extra_json)

        if report and status != 404:
            self.report_error(error, status=status)

        if http_response:
            flask_util.error(str(resp), status=status, response=jsonify(resp),
                             exc_info=log_exception)

    def report_error(self, resp, status=None):
        """Report an error to StackDriver Error reporting.

        Does nothing if the message matches one of the known failures in
        ``_KNOWN_FAILURES``. The message is only used for that filtering: the
        report itself carries a subject built from this class's name and the
        entity's type and status, the source's Bridgy URL, and the current
        request's HTTP context.

        Args:
          resp (str): error message to check against the known failures
          status (int): HTTP response status code to include in the report
        """
        # don't report specific known failures
        if any(known in resp for known in self._KNOWN_FAILURES):
            return

        if self.entity:
            outcome = f'{self.entity.type} {self.entity.status}'
        else:
            outcome = 'failed'
        subject = f'{self.__class__.__name__} {outcome}'

        user = self.source.bridgy_url() if self.source else None
        util.report_error(subject, user=user,
                          http_context=error_reporting.HTTPContext(
                              method=request.method,
                              url=request.url,
                              response_status_code=status,
                              remote_ip=request.remote_addr))