-
Notifications
You must be signed in to change notification settings - Fork 44
/
Copy pathweb.py
1042 lines (857 loc) · 37.7 KB
/
web.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""Webmention protocol with microformats2 in HTML, aka the IndieWeb stack."""
from datetime import timedelta, timezone
import difflib
import logging
import re
import statistics
import urllib.parse
from urllib.parse import quote, urlencode, urljoin, urlparse
from xml.etree import ElementTree
import brevity
from flask import redirect, render_template, request
from google.cloud import ndb
from google.cloud.ndb import ComputedProperty
from granary import as1, as2, atom, microformats2, rss
import mf2util
from oauth_dropins.webutil import flask_util, util
from oauth_dropins.webutil.appengine_config import tasks_client
from oauth_dropins.webutil import appengine_info
from oauth_dropins.webutil.flask_util import cloud_tasks_only, error, flash
from oauth_dropins.webutil.util import domain_from_link, json_dumps, json_loads
from oauth_dropins.webutil import webmention
from requests import HTTPError, RequestException
from requests.auth import HTTPBasicAuth
from werkzeug.exceptions import BadGateway, BadRequest, HTTPException, NotFound
import common
from common import (
CACHE_CONTROL,
DOMAIN_RE,
DOMAINS,
PRIMARY_DOMAIN,
PROTOCOL_DOMAINS,
SUPERDOMAIN,
)
from flask_app import app
from ids import normalize_user_id, translate_object_id, translate_user_id
import memcache
from models import Follower, Object, PROTOCOLS, Target, User
from protocol import Protocol
logger = logging.getLogger(__name__)
# https://github.com/snarfed/bridgy-fed/issues/314
WWW_DOMAINS = frozenset((
'www.jvt.me',
))
FEED_TYPES = {
atom.CONTENT_TYPE.split(';')[0]: 'atom',
rss.CONTENT_TYPE.split(';')[0]: 'rss',
# https://stackoverflow.com/questions/4832357/whats-the-difference-between-text-xml-vs-application-xml-for-webservice-respons
'application/xml': 'xml',
'text/xml': 'xml',
}
MIN_FEED_POLL_PERIOD = timedelta(hours=2)
MAX_FEED_POLL_PERIOD = timedelta(days=1)
MAX_FEED_PROPERTY_SIZE = 500 * 1000 # Object.atom/rss
MAX_FEED_ITEMS_PER_POLL = 10
# populated into Web.redirects_error
OWNS_WEBFINGER = 'This site serves its own Webfinger, and likely ActivityPub too.'
# in addition to common.DOMAIN_BLOCKLIST
FETCH_BLOCKLIST = (
'bsky.app',
)
def is_valid_domain(domain, allow_internal=True):
"""Returns True if this is a valid domain we can use, False otherwise.
Args:
domain (str):
allow_internal (bool): whether to return True for internal domains
like ``fed.brid.gy``, ``bsky.brid.gy``, etc
Valid means TLD is ok, not blacklisted, etc.
"""
if not domain or not re.match(DOMAIN_RE, domain):
# logger.debug(f"{domain} doesn't look like a domain")
return False
if Web.is_blocklisted(domain, allow_internal=allow_internal):
# logger.debug(f'{domain} is blocklisted')
return False
tld = domain.split('.')[-1]
if tld not in brevity.TLDS:
# logger.info(f"{domain} looks like a domain but {tld} isn't a TLD")
return False
return True
class Web(User, Protocol):
"""Web user and webmention protocol implementation.
The key name is the domain.
"""
ABBREV = 'web'
PHRASE = 'the web'
OTHER_LABELS = ('webmention',)
LOGO_HTML = '🌐' # used to be 🕸️
CONTENT_TYPE = common.CONTENT_TYPE_HTML
DEFAULT_ENABLED_PROTOCOLS = ('activitypub',)
DEFAULT_SERVE_USER_PAGES = True
SUPPORTED_AS1_TYPES = (
tuple(as1.ACTOR_TYPES)
+ tuple(as1.POST_TYPES)
+ tuple(as1.CRUD_VERBS)
+ ('audio', 'bookmark', 'event', 'image', 'video')
+ ('follow', 'like', 'share', 'stop-following')
)
has_redirects = ndb.BooleanProperty()
redirects_error = ndb.TextProperty()
has_hcard = ndb.BooleanProperty()
last_webmention_in = ndb.DateTimeProperty(tzinfo=timezone.utc)
last_polled_feed = ndb.DateTimeProperty(tzinfo=timezone.utc)
feed_last_item = ndb.StringProperty() # id (URL)
feed_etag = ndb.StringProperty()
feed_last_modified = ndb.StringProperty()
# only used by protocol bot users in Bluesky, for polling their chat
# messages with chat.bsky.convo.getLog
atproto_last_chat_log_cursor = ndb.StringProperty()
# Originally, BF served Web users' AP actor ids on fed.brid.gy, eg
# https://fed.brid.gy/snarfed.org . When we started adding new protocols, we
# switched to per-protocol subdomains, eg https://web.brid.gy/snarfed.org .
# However, we need to preserve the old users' actor ids as is.
#
# Also, our per-protocol bot accounts in ActivityPub are on their own
# subdomains, eg @bsky.brid.gy@bsky.brid.gy.
#
# So, this property tracks which subdomain a given Web user's AP actor uses.
ap_subdomain = ndb.StringProperty(
choices=['ap', 'bsky', 'fed', 'web', 'fake', 'other', 'efake'],
default='web')
# OLD. some stored entities still have these; do not reuse.
# superfeedr_subscribed = ndb.DateTimeProperty(tzinfo=timezone.utc)
# superfeedr_subscribed_feed = ndb.StringProperty()
@classmethod
def _get_kind(cls):
return 'MagicKey'
def _pre_put_hook(self):
"""Validate domain id, don't allow upper case or invalid characters."""
super()._pre_put_hook()
id = self.key.id()
assert is_valid_domain(id), id
assert id.lower() == id, f'upper case is not allowed in Web key id: {id}'
@classmethod
def get_or_create(cls, id, allow_opt_out=False, verify=None, **kwargs):
"""Normalize domain, then pass through to :meth:`User.get_or_create`.
Normalizing currently consists of lower casing and removing leading and
trailing dots.
Args:
verify (bool): whether to call :meth:`verify` to load h-card, check
redirects, etc. Defaults to calling it only if the user is new.
"""
# normalize id (domain)
domain = cls.key_for(id, allow_opt_out=True).id()
if util.domain_or_parent_in(domain, [SUPERDOMAIN.strip('.')]):
return super().get_by_id(domain)
user = super().get_or_create(domain, allow_opt_out=True, **kwargs)
if not user:
return None
if verify or (verify is None and not user.existing):
user = user.verify()
if not allow_opt_out and user.status:
return None
if not user.existing:
common.create_task(queue='poll-feed', domain=user.key.id())
return user
@ndb.ComputedProperty
def handle(self):
"""Returns this user's chosen username or domain, eg ``user.com``."""
# prettify if domain, noop if username
username = self.username()
if username != self.key.id():
return domain_from_link(username, minimize=False)
return username
def handle_as(self, to_proto):
"""Special case ActivityPub to use custom username."""
if to_proto in ('activitypub', 'ap', PROTOCOLS['ap']):
return (f'@{self.username()}@{self.key.id()}' if self.has_redirects
else f'@{self.key.id()}@{self.ap_subdomain}{SUPERDOMAIN}')
return super().handle_as(to_proto)
def id_as(self, to_proto):
"""Special case ActivityPub to use ``ap_subdomain``."""
if isinstance(to_proto, str):
to_proto = PROTOCOLS[to_proto]
converted = translate_user_id(id=self.key.id(), from_=self,
to=to_proto)
if to_proto.LABEL == 'activitypub':
other = 'web' if self.ap_subdomain == 'fed' else 'fed'
converted = converted.replace(f'https://{other}.brid.gy/',
f'https://{self.ap_subdomain}.brid.gy/')
return converted
web_url = User.profile_id
def id_uri(self):
return self.web_url()
def is_web_url(self, url):
return super().is_web_url(url, ignore_www=True)
def user_page_path(self, rest=None):
"""Always use domain."""
path = f'/{self.ABBREV}/{self.key.id()}'
if rest:
if not rest.startswith('?'):
path += '/'
path += rest.lstrip('/')
return path
def username(self):
"""Returns the user's preferred username.
Uses stored representative h-card if available, falls back to id.
Returns:
str:
"""
id = self.key.id()
if self.obj and self.obj.as1:
for url in (util.get_list(self.obj.as1, 'url') +
util.get_list(self.obj.as1, 'urls')):
url = url.get('value') if isinstance(url, dict) else url
if url and url.startswith('acct:'):
try:
urluser, urldomain = util.parse_acct_uri(url)
except ValueError as e:
continue
if urldomain == id:
logger.info(f'Found custom username: {urluser}')
return urluser
# logger.debug(f'Defaulting username to key id {id}')
return id
@ndb.ComputedProperty
def status(self):
if self.key.id() in common.DOMAINS:
return None
if self.redirects_error == OWNS_WEBFINGER:
# looks like this site is already its own fediverse server
return 'owns-webfinger'
url, _ = self.feed_url()
if (not url and not self.webmention_endpoint() and not self.last_webmention_in
and not self.has_redirects):
return 'no-feed-or-webmention'
return super().status
def verify(self):
"""Fetches site a couple ways to check for redirects and h-card.
Returns:
web.Web: user that was verified. May be different than self! eg if
self's domain started with www and we switch to the root domain.
"""
domain = self.key.id()
logger.info(f'Verifying {domain}')
if domain.startswith('www.') and domain not in WWW_DOMAINS:
# if root domain serves ok, use it instead
# https://github.com/snarfed/bridgy-fed/issues/314
root = domain.removeprefix('www.')
root_site = f'https://{root}/'
try:
resp = util.requests_get(root_site, gateway=False)
if resp.ok and self.is_web_url(resp.url):
logger.info(f'{root_site} serves ok ; using {root} instead')
root_user = Web.get_or_create(
root, enabled_protocols=self.enabled_protocols,
allow_opt_out=True)
self.use_instead = root_user.key
self.put()
return root_user.verify()
except RequestException as e:
logger.info(f"Couldn't fetch {root_site} : {e}")
logger.info(f"Continuing with {domain}")
pass
# check webfinger redirect
path = f'/.well-known/webfinger?resource=acct:{domain}@{domain}'
self.has_redirects = False
self.redirects_error = None
try:
url = urljoin(self.web_url(), path)
resp = util.requests_get(url, gateway=False)
domain_urls = ([f'https://{domain}/' for domain in common.DOMAINS] +
[common.host_url()])
expected = [urljoin(url, path) for url in domain_urls]
if resp.url:
got = urllib.parse.unquote(resp.url)
if got in expected:
self.has_redirects = True
else:
# check host-meta to see if they serve their own Webfinger
resp = util.requests_get(
urljoin(self.web_url(), '/.well-known/host-meta'),
gateway=False)
if (resp.status_code == 200
and domain_from_link(resp.url) not in common.DOMAINS):
logger.info(f"{domain} serves Webfinger! probably a fediverse server")
self.redirects_error = OWNS_WEBFINGER
else:
diff = '\n'.join(difflib.Differ().compare([got], [expected[0]]))
self.redirects_error = f'Current vs expected:<pre>{diff}</pre>'
else:
lines = [url, f' returned HTTP {resp.status_code}']
if resp.url and resp.url != url:
lines[1:1] = [' redirected to:', resp.url]
self.redirects_error = '<pre>' + '\n'.join(lines) + '</pre>'
except RequestException:
pass
# check home page
self.has_hcard = False
if not getattr(self, 'existing', None) == False: # ie this is a new user
self.reload_profile(gateway=True, raise_=False)
if self.obj and self.obj.as1:
self.has_hcard = True
self.put()
return self
@classmethod
def key_for(cls, id, allow_opt_out=False):
"""Returns the :class:`ndb.Key` for a given id.
If id is a domain, uses it as is. If it's a home page URL or fed.brid.gy
or web.brid.gy AP actor URL, extracts the domain and uses that.
Otherwise, returns None.
Args:
id (str)
allow_opt_out (bool): whether to allow users who are currently opted out
Returns:
ndb.Key or None:
"""
if not id:
return None
id = id.lower().strip('.')
if util.is_web(id):
parsed = urlparse(id)
if parsed.path in ('', '/'):
id = parsed.netloc
if is_valid_domain(id, allow_internal=True):
return super().key_for(id, allow_opt_out=allow_opt_out)
return None
@classmethod
def owns_id(cls, id):
"""Returns True on domains and internal URLs, None on other URLs.
All web pages are http(s) URLs, but not all http(s) URLs are web pages.
"""
if not id:
return False
elif is_valid_domain(id, allow_internal=True):
return True
if not util.is_web(id):
return False
domain = domain_from_link(id)
if domain == PRIMARY_DOMAIN or domain in PROTOCOL_DOMAINS:
return True
# we allowed internal domains for protocol bot actors above, but we
# don't want to allow non-homepage URLs on those domains, eg
# https://bsky.brid.gy/foo, so don't allow internal here
if is_valid_domain(domain, allow_internal=False):
return None
return False
@classmethod
def owns_handle(cls, handle, allow_internal=False):
if handle == PRIMARY_DOMAIN or handle in PROTOCOL_DOMAINS:
return True
elif not is_valid_domain(handle, allow_internal=allow_internal):
return False
@classmethod
def handle_to_id(cls, handle):
assert cls.owns_handle(handle) is not False
return handle
@classmethod
def target_for(cls, obj, shared=False):
"""Returns `obj`'s id, as a URL webmention target."""
# TODO: we have entities in prod that fail this, eg
# https://indieweb.social/users/bismark has source_protocol webmention
# assert obj.source_protocol in (cls.LABEL, cls.ABBREV, 'ui', None), str(obj)
if not util.is_web(obj.key.id()):
logger.warning(f"{obj.key.id()} is source_protocol web but id isn't a URL!")
return None
return obj.key.id()
def feed_url(self):
"""Returns this web site's RSS or Atom feed URL and type, if any.
Returns:
(str, type) or (None, None):
"""
if self.obj and self.obj.mf2:
for url, info in self.obj.mf2.get('rel-urls', {}).items():
type = FEED_TYPES.get(info.get('type', '').split(';')[0])
if 'alternate' in info.get('rels', []) and type:
return url, type
return None, None
def webmention_endpoint(self):
"""Returns this web site's webmention endpoint, if any.
Returns:
str: webmention endpoint URL
"""
if self.obj and self.obj.mf2:
for url, info in self.obj.mf2.get('rel-urls', {}).items():
if 'webmention' in info.get('rels', []):
return url
@classmethod
def send(to_cls, obj, url, from_user=None, orig_obj_id=None, **kwargs):
"""Sends a webmention to a given target URL.
See :meth:`Protocol.send` for details.
Returns False if the target URL doesn't advertise a webmention endpoint,
or if webmention/microformats2 don't support the activity type.
https://fed.brid.gy/docs#error-handling
"""
targets = as1.targets(obj.as1)
if not (url in targets or
# homepage, check domain too
(urlparse(url).path.strip('/') == ''
and domain_from_link(url) in targets)):
logger.debug(f'Skipping sending to {url} , not a target in the object')
return False
if to_cls.is_blocklisted(url):
logger.info(f'Skipping sending to blocklisted {url}')
return False
source_id = translate_object_id(
id=obj.key.id(), from_=PROTOCOLS[obj.source_protocol], to=Web)
source_url = quote(source_id, safe=':/%+')
logger.info(f'Sending webmention from {source_url} to {url}')
# we only send webmentions for responses. for sending normal posts etc
# to followers, we just update our stored objects (elsewhere) and web
# users consume them via feeds.
endpoint = webmention_discover(url).endpoint
if not endpoint:
return False
webmention.send(endpoint, source_url, url)
return True
@classmethod
def load(cls, id, **kwargs):
"""Wrap :meth:`Protocol.load` to convert domains to homepage URLs."""
if re.match(DOMAIN_RE, id):
id = f'https://{id}/'
return super().load(id, **kwargs)
@classmethod
def fetch(cls, obj, gateway=False, check_backlink=False,
authorship_fetch_mf2=True, metaformats=None, **kwargs):
"""Fetches a URL over HTTP and extracts its microformats2.
Follows redirects, but doesn't change the original URL in ``obj``'s id!
:class:`google.cloud.ndb.model.Model` doesn't allow that anyway, but more
importantly, we want to preserve that original URL becase other objects
may refer to it instead of the final redirect destination URL.
See :meth:`Protocol.fetch` for other background.
Args:
gateway (bool): passed through to
:func:`oauth_dropins.webutil.util.fetch_mf2`
check_backlink (bool): optional, whether to require a link to Bridgy
Fed. Ignored if the URL is a homepage, ie has no path.
authorship_fetch_mf2 (bool): optional, when running the authorship
algorithm, fetch author URL if necessary
kwargs: ignored
"""
url = obj.key.id()
if not util.is_web(url):
logger.info(f'{url} is not a URL')
return False
elif (cls.is_blocklisted(url, allow_internal=True)
or util.domain_or_parent_in(domain_from_link(url), FETCH_BLOCKLIST)):
return False
is_homepage = urlparse(url).path.strip('/') == ''
if is_homepage:
domain = domain_from_link(url)
if domain == PRIMARY_DOMAIN or domain in PROTOCOL_DOMAINS:
profile = util.read(f'{domain}.as2.json')
if profile:
obj.as2 = json_loads(profile)
return True
return False
require_backlink = (common.host_url().rstrip('/')
if check_backlink and not is_homepage
else None)
if metaformats is None:
# default to only for homepages
metaformats = is_homepage
try:
parsed = util.fetch_mf2(url, gateway=gateway, metaformats=metaformats,
require_backlink=require_backlink)
except ValueError as e:
error(str(e))
if parsed is None or not parsed.get('items'):
if parsed:
# we got valid HTML. save the Object so that we know this URL is web
obj.source_protocol = 'web'
obj.put()
logger.info(f'No microformats2 found in {url}')
return False
# find mf2 item
if is_homepage:
logger.info(f"{url} is user's web url")
parsed_url = (parsed['url'] or '').rstrip('/')
# try both with and without trailing slash
entry = (mf2util.representative_hcard(parsed, parsed_url)
or mf2util.representative_hcard(parsed, parsed_url + '/'))
if not entry:
error(f"Couldn't find a representative h-card (http://microformats.org/wiki/representative-h-card-parsing) on {parsed['url']}")
logger.info(f'Found representative h-card')
# handle when eg https://user.com/ redirects to https://www.user.com/
# we need to store this as https://user.com/
if parsed['url'] != url:
logger.info(f'overriding {parsed["url"]} with {url}')
entry['properties']['url'] = [url]
if rel_url := parsed['rel-urls'].pop(parsed['url'], None):
parsed['rel-urls'][url] = rel_url
parsed['url'] = url
else:
entry = mf2util.find_first_entry(parsed, ['h-entry'])
if not entry:
error(f'No microformats2 h-entry found in {url}')
# discard uid if set; we use URL as id
props = entry.setdefault('properties', {})
if 'uid' in props:
logger.info(f'Discarding uid property: {props["uid"]}')
props.pop('uid')
# store final URL in mf2 object
if is_homepage:
entry.setdefault('rel-urls', {}).update(parsed.get('rel-urls', {}))
entry.setdefault('type', ['h-card'])
if parsed['url']:
entry['url'] = parsed['url']
logger.info(f'Extracted microformats2 entry: {json_dumps(entry)[:500]}')
if not is_homepage:
# default actor/author to home page URL
authors = props.setdefault('author', [])
if not microformats2.get_string_urls(authors):
homepage = urljoin(parsed.get('url') or url, '/')
logger.info(f'Defaulting author URL to {homepage}')
if authors and isinstance(authors[0], dict):
authors[0]['properties']['url'] = [homepage]
else:
authors.insert(0, homepage)
# run full authorship algorithm if necessary:
# https://indieweb.org/authorship
# duplicated in microformats2.json_to_object
author = util.get_first(props, 'author')
if not isinstance(author, dict):
logger.info(f'Fetching full authorship for author {author}')
fetch_fn = util.fetch_mf2 if authorship_fetch_mf2 else None
try:
author = mf2util.find_author({'items': [entry]}, hentry=entry,
fetch_mf2_func=fetch_fn)
except (ValueError, TypeError) as e:
logger.warning(e)
author = None
logger.debug(f'Got: {author}')
if author:
props['author'] = util.trim_nulls([{
"type": ["h-card"],
'properties': {
field: [author[field]] if author.get(field) else []
for field in ('name', 'photo', 'url')
},
}])
obj.mf2 = entry
return True
@classmethod
def _convert(cls, obj, from_user=None):
"""Converts a :class:`Object` to HTML.
Args:
obj (models.Object)
from_user (models.User): user (actor) this activity/object is from
Returns:
str:
"""
if not obj or not obj.as1:
return ''
obj_as1 = obj.as1
if from_user and not from_user.is_enabled(cls):
error(f'{from_user.key.id()} => {cls.LABEL} not enabled')
from_proto = PROTOCOLS.get(obj.source_protocol)
if from_proto:
# fill in author/actor if available
for field in 'author', 'actor':
val = as1.get_object(obj.as1, field)
if val.keys() == set(['id']) and val['id']:
loaded = from_proto.load(val['id'], raise_=False)
if loaded and loaded.as1:
obj_as1 = {**obj_as1, field: loaded.as1}
else:
logger.debug(f'Not hydrating actor or author due to source_protocol {obj.source_protocol}')
html = microformats2.activities_to_html([cls.translate_ids(obj_as1)])
# add HTML meta redirect to source page. should trigger for end users in
# browsers but not for webmention receivers (hopefully).
url = util.get_url(obj_as1) or obj_as1.get('id') or obj.key.id()
if util.is_web(url):
utf8 = '<meta charset="utf-8">'
refresh = f'<meta http-equiv="refresh" content="0;url={url}">'
html = html.replace(utf8, utf8 + '\n' + refresh)
return html
@app.get('/web-site')
@flask_util.headers(CACHE_CONTROL)
def enter_web_site():
return render_template('enter_web_site.html')
@app.post('/web-site')
def check_web_site():
logger.info(f'Params: {list(request.form.items())}')
url = request.values['url']
# this normalizes and lower cases domain
try:
domain = normalize_user_id(id=url, proto=Web)
except (ValueError, AssertionError):
logger.info(f'bad web id? {url}', exc_info=True)
domain = None
invalid_msg = util.linkify(f'{url} is not a <a href="/docs#web-get-started">valid or supported web site</a>', pretty=True)
if not domain or not is_valid_domain(domain, allow_internal=False):
flash(invalid_msg)
return render_template('enter_web_site.html'), 400
if util.is_web(url) and urlparse(url).path.strip('/'):
flash('Only top-level web sites and domains are supported.')
return render_template('enter_web_site.html'), 400
try:
user = Web.get_or_create(domain, enabled_protocols=['atproto'],
propagate=True, reload=True, verify=True)
except BaseException as e:
code, body = util.interpret_http_exception(e)
if code:
flash(util.linkify(f"Couldn't connect to {url}: {e}", pretty=True))
return render_template('enter_web_site.html')
raise
if not user: # opted out
flash(invalid_msg)
return render_template('enter_web_site.html'), 400
user.put()
if user.redirects_error == OWNS_WEBFINGER:
flash(f'{url} looks like a fediverse server! Try a normal web site.')
return render_template('enter_web_site.html'), 400
common.create_task(queue='poll-feed', domain=domain)
return redirect(user.user_page_path())
@app.post('/webmention')
def webmention_external():
"""Handles inbound webmention, enqueue task to process.
Use a task queue to deliver to followers because we send to each inbox in
serial, which can take a long time with many followers/instances.
"""
logger.info(f'Params: {list(request.form.items())}')
source = flask_util.get_required_param('source').strip()
if Web.owns_id(source) is False:
error(f'Bad URL {source}')
elif urlparse(source).scheme != 'https':
error('source URLs must be https (with SSL)')
domain = domain_from_link(source, minimize=False)
if not domain:
error(f'Bad source URL {source}')
user = Web.get_by_id(domain)
if not user:
error(f'No user found for domain {domain}')
user.last_webmention_in = util.now()
user.put()
return common.create_task('webmention', **request.form)
def poll_feed(user, feed_url, rel_type):
"""Fetches a :class:`Web` site's feed and delivers new/updated posts.
Args:
user (Web)
feed_url (str)
rel_type (str): feed link's top-level rel type in home page HTML, usually
either ``atom`` or ``rss`
Returns:
list of dict AS1 activities:
"""
user.last_polled_feed = util.now()
# fetch feed
headers = {}
if user.feed_etag:
headers['If-None-Match'] = user.feed_etag
if user.feed_last_modified:
headers['If-Modified-Since'] = user.feed_last_modified
resp = util.requests_get(feed_url, headers=headers, gateway=True)
# update user
user.feed_etag = resp.headers.get('ETag')
user.feed_last_modified = resp.headers.get('Last-Modified')
# parse feed
content_type = resp.headers.get('Content-Type') or ''
type = FEED_TYPES.get(content_type.split(';')[0])
if resp.status_code == 304:
logger.info('Feed is unchanged since last poll')
user.put()
return []
elif type == 'atom' or (type == 'xml' and rel_type == 'atom'):
activities = atom.atom_to_activities(resp.text)
obj_feed_prop = {'atom': resp.text[:MAX_FEED_PROPERTY_SIZE]}
elif type == 'rss' or (type == 'xml' and rel_type == 'rss'):
activities = rss.to_activities(resp.text)
obj_feed_prop = {'rss': resp.text[:MAX_FEED_PROPERTY_SIZE]}
else:
raise ValueError(f'Unknown feed type {content_type}')
if len(activities) > MAX_FEED_ITEMS_PER_POLL:
logger.info(f'Got {len(activities)} feed items, only processing the first {MAX_FEED_ITEMS_PER_POLL}')
activities = activities[:MAX_FEED_ITEMS_PER_POLL]
# create receive tasks
for i, activity in enumerate(activities):
# default actor and author to user
activity.setdefault('actor', {}).setdefault('id', user.profile_id())
obj = activity.setdefault('object', {})
obj.setdefault('author', {}).setdefault('id', user.profile_id())
# use URL as id since some feeds use non-URL (eg tag URI) ids
for elem in obj, activity:
if url := elem.get('url'):
elem['id'] = elem['url']
logger.debug(f'Converted to AS1: {json_dumps(activity, indent=2)}')
id = Object(our_as1=activity).as1.get('id')
if not id:
logger.warning('No id or URL!')
continue
if i == 0:
logger.info(f'Setting feed_last_item to {id}')
user.feed_last_item = id
elif id == user.feed_last_item:
logger.info(f'Already seen {id}, skipping rest of feed')
break
if Web.owns_id(id) is False:
logger.warning(f'Skipping bad id {id}')
continue
if not obj.get('image'):
# fetch and check the post itself
logger.info(f'No image in {id} , trying metaformats')
post = Object(id=id)
try:
fetched = Web.fetch(post, metaformats=True, authorship_fetch_mf2=False)
except (RequestException, HTTPException):
fetched = False
if fetched and post.as1:
profile_images = (as1.get_ids(user.obj.as1, 'image')
if user.obj.as1 else [])
obj['image'] = [img for img in as1.get_ids(post.as1, 'image')
if img not in profile_images]
activity['feed_index'] = i
common.create_task(queue='receive', id=id, our_as1=activity,
source_protocol=Web.ABBREV, authed_as=user.key.id(),
received_at=util.now().isoformat(), **obj_feed_prop)
return activities
@app.post(f'/queue/poll-feed')
@cloud_tasks_only(log=None)
def poll_feed_task():
"""Task handler for polling a :class:`Web` user's feed.
Params:
``domain`` (str): key id of the :class:`Web` user
``last_polled`` (str): should match the user's ``last_polled_feed``. Used
to detect duplicate poll tasks for the same user.
"""
logger.info(f'Params: {request.values}')
domain = flask_util.get_required_param('domain')
logger.info(f'Polling feed for {domain}')
user = Web.get_by_id(domain)
if not (user and user.obj and user.obj.mf2):
error(f'No Web user or object found for domain {domain}', status=304)
elif user.last_webmention_in:
logger.info(f'Dropping since last_webmention_in is set')
return 'OK'
logger.info(f'Last poll: {user.last_polled_feed}')
last_polled = request.form.get('last_polled')
if (last_polled and user.last_polled_feed
and last_polled != user.last_polled_feed.isoformat()):
logger.warning('duplicate poll feed task! deferring to other task')
return '', 204
# discover feed URL
url, rel_type = user.feed_url()
if not url:
msg = f"User {user.key.id()} has no feed URL, can't fetch feed"
logger.info(msg)
return msg
# go go go!
activities = []
status = 200
try:
activities = poll_feed(user, url, rel_type)
except (ValueError, ElementTree.ParseError) as e:
logger.error(f"Couldn't parse feed: {e}")
status = 204
except BaseException as e:
code, _ = util.interpret_http_exception(e)
if code or util.is_connection_failure(e):
logger.error(f"Couldn't fetch feed: {e}")
status = 204
else:
raise
user.put()
# determine posting frequency
published_last = None
published_deltas = [] # timedeltas between entry published times
for activity in activities:
try:
published = util.parse_iso8601(activity['object']['published'])
except (KeyError, ValueError):
continue
if published_last:
published_deltas.append(abs(published - published_last))
published_last = published
# create next poll task
def clamp(delay):
return max(min(delay, MAX_FEED_POLL_PERIOD), MIN_FEED_POLL_PERIOD)
if published_deltas:
delay = clamp(timedelta(seconds=statistics.mean(
t.total_seconds() for t in published_deltas)))
else:
delay = clamp(util.now() -
(user.last_polled_feed if user.last_polled_feed and activities
else user.created.replace(tzinfo=timezone.utc)))
common.create_task(queue='poll-feed', delay=delay, domain=user.key.id(),
last_polled=user.last_polled_feed.isoformat())
return 'OK', status
@app.post('/queue/webmention')
@cloud_tasks_only(log=None)
def webmention_task():
"""Handles inbound webmention task.
Params:
``source`` (str): URL
"""
logger.info(f'Params: {list(request.form.items())}')
# load user
source = flask_util.get_required_param('source').strip()
domain = domain_from_link(source, minimize=False)
logger.info(f'webmention from {domain}')
if domain in common.DOMAINS:
error(f'URL not supported: {source}')
user = Web.get_by_id(domain)
if not user:
error(f'No user found for domain {domain}', status=304)
logger.info(f'User: {user.key.id()}')
# fetch source page
try:
# remote=True to force fetch, local=True to populate new/changed attrs
obj = Web.load(source, local=True, remote=True,
check_backlink=not appengine_info.LOCAL_SERVER)
except BadRequest as e:
error(str(e.description), status=304)
except RequestException as e:
code, body = util.interpret_http_exception(e)
if code not in ('410', '404') or user.is_web_url(source):
error(f'{e} ; {e.response.text if e.response else ""}', status=502)
create_id = f'{source}#bridgy-fed-create'
logger.info(f'Interpreting as Delete. Looking for {create_id}')
create = Object.get_by_id(create_id)
if not create:
error(f"Bridgy Fed hasn't published {source}", status=304)
id = f'{source}#bridgy-fed-delete'
obj = Object(id=id, our_as1={
'id': id,
'objectType': 'activity',
'verb': 'delete',
'actor': user.web_url(),
'object': source,
})
if not obj or (not obj.mf2 and obj.type != 'delete'):
error(f"Couldn't load {source} as microformats2 HTML", status=304)
elif obj.mf2 and 'h-entry' in obj.mf2.get('type', []):
authors = obj.mf2['properties'].setdefault('author', [])
author_urls = microformats2.get_string_urls(authors)
if not author_urls: