15
15
"""
16
16
17
17
__author__ = "Leonard Richardson (leonardr@segfault.org)"
18
- __version__ = "4.9.0 "
18
+ __version__ = "4.9.3 "
19
19
__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson"
20
20
# Use of this source code is governed by the MIT license.
21
21
__license__ = "MIT"
22
22
23
23
__all__ = ['BeautifulSoup' ]
24
24
25
+ from collections import Counter
25
26
import os
26
27
import re
27
28
import sys
39
40
NavigableString ,
40
41
PageElement ,
41
42
ProcessingInstruction ,
43
+ PYTHON_SPECIFIC_ENCODINGS ,
42
44
ResultSet ,
45
+ Script ,
46
+ Stylesheet ,
43
47
SoupStrainer ,
44
48
Tag ,
49
+ TemplateString ,
45
50
)
46
51
47
52
# The very first thing we do is give a useful error if someone is
48
53
# running this code under Python 3 without converting it.
49
54
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.' <> 'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
50
55
56
+ # Define some custom warnings.
57
class GuessedAtParserWarning(UserWarning):
    """The warning issued when BeautifulSoup has to guess what parser to
    use -- probably because no parser was specified in the constructor.
    """
61
+
62
class MarkupResemblesLocatorWarning(UserWarning):
    """The warning issued when BeautifulSoup is given 'markup' that
    actually looks like a resource locator -- a URL or a path to a file
    on disk.
    """
67
+
68
+
51
69
class BeautifulSoup (Tag ):
52
70
"""A data structure representing a parsed HTML or XML document.
53
71
@@ -93,7 +111,7 @@ class BeautifulSoup(Tag):
93
111
ASCII_SPACES = '\x20 \x0a \x09 \x0c \x0d '
94
112
95
113
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\" %(parser)s\" ). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n \n The code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\" %(parser)s\" ' to the BeautifulSoup constructor.\n "
96
-
114
+
97
115
def __init__ (self , markup = "" , features = None , builder = None ,
98
116
parse_only = None , from_encoding = None , exclude_encodings = None ,
99
117
element_classes = None , ** kwargs ):
@@ -235,7 +253,9 @@ def deprecated_argument(old_name, new_name):
235
253
if not original_builder and not (
236
254
original_features == builder .NAME or
237
255
original_features in builder .ALTERNATE_NAMES
238
- ):
256
+ ) and markup :
257
+ # The user did not tell us which TreeBuilder to use,
258
+ # and we had to guess. Issue a warning.
239
259
if builder .is_xml :
240
260
markup_type = "XML"
241
261
else :
@@ -269,7 +289,10 @@ def deprecated_argument(old_name, new_name):
269
289
parser = builder .NAME ,
270
290
markup_type = markup_type
271
291
)
272
- warnings .warn (self .NO_PARSER_SPECIFIED_WARNING % values , stacklevel = 2 )
292
+ warnings .warn (
293
+ self .NO_PARSER_SPECIFIED_WARNING % values ,
294
+ GuessedAtParserWarning , stacklevel = 2
295
+ )
273
296
else :
274
297
if kwargs :
275
298
warnings .warn ("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`." )
@@ -309,7 +332,8 @@ def deprecated_argument(old_name, new_name):
309
332
warnings .warn (
310
333
'"%s" looks like a filename, not markup. You should'
311
334
' probably open this file and pass the filehandle into'
312
- ' Beautiful Soup.' % self ._decode_markup (markup )
335
+ ' Beautiful Soup.' % self ._decode_markup (markup ),
336
+ MarkupResemblesLocatorWarning
313
337
)
314
338
self ._check_markup_is_url (markup )
315
339
@@ -396,7 +420,8 @@ def _check_markup_is_url(cls, markup):
396
420
' requests to get the document behind the URL, and feed'
397
421
' that document to Beautiful Soup.' % cls ._decode_markup (
398
422
markup
399
- )
423
+ ),
424
+ MarkupResemblesLocatorWarning
400
425
)
401
426
402
427
def _feed (self ):
@@ -422,13 +447,28 @@ def reset(self):
422
447
self .current_data = []
423
448
self .currentTag = None
424
449
self .tagStack = []
450
+ self .open_tag_counter = Counter ()
425
451
self .preserve_whitespace_tag_stack = []
426
452
self .string_container_stack = []
427
453
self .pushTag (self )
428
454
429
455
def new_tag (self , name , namespace = None , nsprefix = None , attrs = {},
430
456
sourceline = None , sourcepos = None , ** kwattrs ):
431
- """Create a new Tag associated with this BeautifulSoup object."""
457
+ """Create a new Tag associated with this BeautifulSoup object.
458
+
459
+ :param name: The name of the new Tag.
460
+ :param namespace: The URI of the new Tag's XML namespace, if any.
461
+ :param nsprefix: The prefix for the new Tag's XML namespace, if any.
462
+ :param attrs: A dictionary of this Tag's attribute values; can
463
+ be used instead of `kwattrs` for attributes like 'class'
464
+ that are reserved words in Python.
465
+ :param sourceline: The line number where this tag was
466
+ (purportedly) found in its source document.
467
+ :param sourcepos: The character position within `sourceline` where this
468
+ tag was (purportedly) found.
469
+ :param kwattrs: Keyword arguments for the new Tag's attribute values.
470
+
471
+ """
432
472
kwattrs .update (attrs )
433
473
return self .element_classes .get (Tag , Tag )(
434
474
None , self .builder , name , namespace , nsprefix , kwattrs ,
@@ -458,13 +498,13 @@ def new_string(self, s, subclass=None):
458
498
container = self .string_container (subclass )
459
499
return container (s )
460
500
461
def insert_before(self, *args):
    """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
    it because there is nothing before or after it in the parse tree.

    :param args: Accepted only so that any call shape reaches the
        error below; the values are never used.
    :raises NotImplementedError: Always.
    """
    raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
466
506
467
- def insert_after (self , successor ):
507
+ def insert_after (self , * args ):
468
508
"""This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
469
509
it because there is nothing before or after it in the parse tree.
470
510
"""
@@ -473,22 +513,26 @@ def insert_after(self, successor):
473
513
def popTag(self):
    """Internal method called by _popToTag when a tag is closed.

    Removes the most recently opened tag from `tagStack` and keeps the
    auxiliary bookkeeping in step with it: the per-name open-tag
    counter, the whitespace-preserving stack and the string-container
    stack.

    :return: The value of `currentTag` after the pop. This is the new
        top of `tagStack`, or the previous `currentTag` unchanged if
        the stack is now empty.
    """
    tag = self.tagStack.pop()
    # Only decrement names that are already tracked, so that popping a
    # tag never creates a new counter entry.
    if tag.name in self.open_tag_counter:
        self.open_tag_counter[tag.name] -= 1
    if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
        self.preserve_whitespace_tag_stack.pop()
    if self.string_container_stack and tag == self.string_container_stack[-1]:
        self.string_container_stack.pop()
    #print("Pop", tag.name)
    if self.tagStack:
        self.currentTag = self.tagStack[-1]
    return self.currentTag
484
526
485
527
def pushTag (self , tag ):
486
528
"""Internal method called by handle_starttag when a tag is opened."""
487
- #print "Push", tag.name
529
+ #print( "Push", tag.name)
488
530
if self .currentTag is not None :
489
531
self .currentTag .contents .append (tag )
490
532
self .tagStack .append (tag )
491
533
self .currentTag = self .tagStack [- 1 ]
534
+ if tag .name != self .ROOT_TAG_NAME :
535
+ self .open_tag_counter [tag .name ] += 1
492
536
if tag .name in self .builder .preserve_whitespace_tags :
493
537
self .preserve_whitespace_tag_stack .append (tag )
494
538
if tag .name in self .builder .string_containers :
@@ -599,15 +643,19 @@ def _linkage_fixer(self, el):
599
643
600
644
def _popToTag (self , name , nsprefix = None , inclusivePop = True ):
601
645
"""Pops the tag stack up to and including the most recent
602
- instance of the given tag.
646
+ instance of the given tag.
647
+
648
+ If there are no open tags with the given name, nothing will be
649
+ popped.
603
650
604
651
:param name: Pop up to the most recent tag with this name.
605
652
:param nsprefix: The namespace prefix that goes with `name`.
606
653
:param inclusivePop: If this is false, pops the tag stack up
607
654
to but *not* including the most recent instance of the
608
655
given tag.
656
+
609
657
"""
610
- #print "Popping to %s" % name
658
+ #print( "Popping to %s" % name)
611
659
if name == self .ROOT_TAG_NAME :
612
660
# The BeautifulSoup object itself can never be popped.
613
661
return
@@ -616,6 +664,8 @@ def _popToTag(self, name, nsprefix=None, inclusivePop=True):
616
664
617
665
stack_size = len (self .tagStack )
618
666
for i in range (stack_size - 1 , 0 , - 1 ):
667
+ if not self .open_tag_counter .get (name ):
668
+ break
619
669
t = self .tagStack [i ]
620
670
if (name == t .name and nsprefix == t .prefix ):
621
671
if inclusivePop :
@@ -642,7 +692,7 @@ def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
642
692
in the document. For instance, if this was a self-closing tag,
643
693
don't call handle_endtag.
644
694
"""
645
- # print "Start tag %s: %s" % (name, attrs)
695
+ # print( "Start tag %s: %s" % (name, attrs) )
646
696
self .endData ()
647
697
648
698
if (self .parse_only and len (self .tagStack ) <= 1
@@ -669,14 +719,14 @@ def handle_endtag(self, name, nsprefix=None):
669
719
:param name: Name of the tag.
670
720
:param nsprefix: Namespace prefix for the tag.
671
721
"""
672
- #print "End tag: " + name
722
+ #print( "End tag: " + name)
673
723
self .endData ()
674
724
self ._popToTag (name , nsprefix )
675
725
676
726
def handle_data(self, data):
    """Called by the tree builder when a chunk of textual data is encountered.

    :param data: The chunk of data; it is appended to
        `self.current_data` as-is.
    """
    self.current_data.append(data)
679
-
729
+
680
730
def decode (self , pretty_print = False ,
681
731
eventual_encoding = DEFAULT_OUTPUT_ENCODING ,
682
732
formatter = "minimal" ):
@@ -691,6 +741,11 @@ def decode(self, pretty_print=False,
691
741
if self .is_xml :
692
742
# Print the XML declaration
693
743
encoding_part = ''
744
+ if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS :
745
+ # This is a special Python encoding; it can't actually
746
+ # go into an XML document because it means nothing
747
+ # outside of Python.
748
+ eventual_encoding = None
694
749
if eventual_encoding != None :
695
750
encoding_part = ' encoding="%s"' % eventual_encoding
696
751
prefix = u'<?xml version="1.0"%s?>\n ' % encoding_part
@@ -733,4 +788,4 @@ class FeatureNotFound(ValueError):
733
788
# When run as a script: parse the markup arriving on standard input
# and print the prettified result.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())
0 commit comments