Add modified version of BeautifulSoup4 (nth-of-type pseudoselector and full-featured direct descendant support)
parent
8e951f6b27
commit
d3bd59f813
@ -0,0 +1,361 @@
|
|||||||
|
"""Beautiful Soup
|
||||||
|
Elixir and Tonic
|
||||||
|
"The Screen-Scraper's Friend"
|
||||||
|
http://www.crummy.com/software/BeautifulSoup/
|
||||||
|
|
||||||
|
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
||||||
|
(possibly invalid) document into a tree representation. Beautiful Soup
|
||||||
|
provides provides methods and Pythonic idioms that make it easy to
|
||||||
|
navigate, search, and modify the parse tree.
|
||||||
|
|
||||||
|
Beautiful Soup works with Python 2.6 and up. It works better if lxml
|
||||||
|
and/or html5lib is installed.
|
||||||
|
|
||||||
|
For more than you ever wanted to know about Beautiful Soup, see the
|
||||||
|
documentation:
|
||||||
|
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||||
|
"""
|
||||||
|
|
||||||
|
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||||
|
__version__ = "4.1.3"
|
||||||
|
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
|
__all__ = ['BeautifulSoup']
|
||||||
|
|
||||||
|
import re
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
from .builder import builder_registry
|
||||||
|
from .dammit import UnicodeDammit
|
||||||
|
from .element import (
|
||||||
|
CData,
|
||||||
|
Comment,
|
||||||
|
DEFAULT_OUTPUT_ENCODING,
|
||||||
|
Declaration,
|
||||||
|
Doctype,
|
||||||
|
NavigableString,
|
||||||
|
PageElement,
|
||||||
|
ProcessingInstruction,
|
||||||
|
ResultSet,
|
||||||
|
SoupStrainer,
|
||||||
|
Tag,
|
||||||
|
)
|
||||||
|
|
||||||
|
# The very first thing we do is give a useful error if someone is
|
||||||
|
# running this code under Python 3 without converting it.
|
||||||
|
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||||
|
|
||||||
|
class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """

    # Name of the synthetic root tag that holds the whole document.
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser."""

        # Warn about BS3 constructor arguments that BS4 no longer
        # honors, discarding the ones that would otherwise trip the
        # unexpected-keyword check below.
        if 'convertEntities' in kwargs:
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            # Translate a renamed BS3 keyword argument into its BS4
            # equivalent, warning the caller about the rename.
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if len(kwargs) > 0:
            # list(kwargs)[0] works on both Python 2 and Python 3;
            # the previous kwargs.keys().pop() breaks on Python 3,
            # where keys() returns a view without a pop() method.
            arg = list(kwargs)[0]
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        self.reset()

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) = (
             self.builder.prepare_markup(markup, from_encoding))

        try:
            self._feed()
        except StopParsing:
            pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reset this object to a clean state, ready to parse a new
        document: the soup itself becomes the root of the tag stack."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s):
        """Create a new NavigableString associated with this soup."""
        navigable = NavigableString(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        # The document root has no siblings, so sibling insertion is
        # meaningless here.
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        # The document root has no siblings, so sibling insertion is
        # meaningless here.
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the most recently opened tag and return the new current tag."""
        tag = self.tagStack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Make `tag` the current tag, appending it to the old current
        tag's contents."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Close out the text node being accumulated in currentData and,
        unless a SoupStrainer rejects it, attach it to the tree wrapped
        in `containerClass`."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Collapse all-ASCII-whitespace text to a single '\n' or
            # ' ', unless we're inside a whitespace-preserving tag.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.builder.preserve_whitespace_tags)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(currentData)):
                return
            o = containerClass(currentData)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, previous_element=None):
        """Add an object to the parse tree."""
        parent = parent or self.currentTag
        previous_element = previous_element or self.previous_element
        o.setup(parent, previous_element)
        if self.previous_element:
            self.previous_element.next_element = o
        self.previous_element = o
        parent.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The root tag is never popped.
            return

        numPops = 0
        mostRecentTag = None

        # Walk the stack from the top looking for the nearest matching
        # tag (index 0 is the root and is excluded).
        for i in range(len(self.tagStack) - 1, 0, -1):
            if (name == self.tagStack[i].name
                and nsprefix == self.tagStack[i].prefix):
                numPops = len(self.tagStack) - i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        # Top-level tags rejected by the SoupStrainer are dropped.
        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self.previous_element)
        if tag is None:
            return tag
        if self.previous_element:
            self.previous_element.next_element = tag
        self.previous_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        # Text arrives in fragments; they're joined in endData().
        self.currentData.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding."""

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
|
||||||
|
|
||||||
|
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated BS3 name for an XML-parsing soup.

    Kept only for backward compatibility; use
    BeautifulSoup(markup, features="xml") instead.
    """

    def __init__(self, *args, **kwargs):
        # Let the caller know this entry point is obsolete, then defer
        # to BeautifulSoup with the XML feature forced on.
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        kwargs['features'] = 'xml'
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class StopParsing(Exception):
    """Raised by a tree builder to abort parsing early; the
    BeautifulSoup constructor catches it around _feed() and keeps
    whatever tree was built so far."""
    pass
|
||||||
|
|
||||||
|
|
||||||
|
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no registered tree
    builder advertises the requested features."""
    pass
|
||||||
|
|
||||||
|
|
||||||
|
# By default, act as an HTML pretty-printer: read markup from standard
# input and print the prettified parse tree. (Python 2 print statement.)
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
|
@ -0,0 +1,361 @@
|
|||||||
|
"""Beautiful Soup
|
||||||
|
Elixir and Tonic
|
||||||
|
"The Screen-Scraper's Friend"
|
||||||
|
http://www.crummy.com/software/BeautifulSoup/
|
||||||
|
|
||||||
|
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
||||||
|
(possibly invalid) document into a tree representation. Beautiful Soup
|
||||||
|
provides provides methods and Pythonic idioms that make it easy to
|
||||||
|
navigate, search, and modify the parse tree.
|
||||||
|
|
||||||
|
Beautiful Soup works with Python 2.6 and up. It works better if lxml
|
||||||
|
and/or html5lib is installed.
|
||||||
|
|
||||||
|
For more than you ever wanted to know about Beautiful Soup, see the
|
||||||
|
documentation:
|
||||||
|
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||||
|
"""
|
||||||
|
|
||||||
|
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||||
|
__version__ = "4.1.3"
|
||||||
|
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
|
__all__ = ['BeautifulSoup']
|
||||||
|
|
||||||
|
import re
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
from .builder import builder_registry
|
||||||
|
from .dammit import UnicodeDammit
|
||||||
|
from .element import (
|
||||||
|
CData,
|
||||||
|
Comment,
|
||||||
|
DEFAULT_OUTPUT_ENCODING,
|
||||||
|
Declaration,
|
||||||
|
Doctype,
|
||||||
|
NavigableString,
|
||||||
|
PageElement,
|
||||||
|
ProcessingInstruction,
|
||||||
|
ResultSet,
|
||||||
|
SoupStrainer,
|
||||||
|
Tag,
|
||||||
|
)
|
||||||
|
|
||||||
|
# The very first thing we do is give a useful error if someone is
|
||||||
|
# running this code under Python 3 without converting it.
|
||||||
|
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||||
|
|
||||||
|
class BeautifulSoup(Tag):
|
||||||
|
"""
|
||||||
|
This class defines the basic interface called by the tree builders.
|
||||||
|
|
||||||
|
These methods will be called by the parser:
|
||||||
|
reset()
|
||||||
|
feed(markup)
|
||||||
|
|
||||||
|
The tree builder may call these methods from its feed() implementation:
|
||||||
|
handle_starttag(name, attrs) # See note about return value
|
||||||
|
handle_endtag(name)
|
||||||
|
handle_data(data) # Appends to the current data node
|
||||||
|
endData(containerClass=NavigableString) # Ends the current data node
|
||||||
|
|
||||||
|
No matter how complicated the underlying parser is, you should be
|
||||||
|
able to build a tree using 'start tag' events, 'end tag' events,
|
||||||
|
'data' events, and "done with data" events.
|
||||||
|
|
||||||
|
If you encounter an empty-element tag (aka a self-closing tag,
|
||||||
|
like HTML's <br> tag), call handle_starttag and then
|
||||||
|
handle_endtag.
|
||||||
|
"""
|
||||||
|
ROOT_TAG_NAME = u'[document]'
|
||||||
|
|
||||||
|
# If the end-user gives no indication which tree builder they
|
||||||
|
# want, look for one with these features.
|
||||||
|
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
|
||||||
|
|
||||||
|
# Used when determining whether a text node is all whitespace and
|
||||||
|
# can be replaced with a single space. A text node that contains
|
||||||
|
# fancy Unicode spaces (usually non-breaking) should be left
|
||||||
|
# alone.
|
||||||
|
STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
|
||||||
|
|
||||||
|
def __init__(self, markup="", features=None, builder=None,
|
||||||
|
parse_only=None, from_encoding=None, **kwargs):
|
||||||
|
"""The Soup object is initialized as the 'root tag', and the
|
||||||
|
provided markup (which can be a string or a file-like object)
|
||||||
|
is fed into the underlying parser."""
|
||||||
|
|
||||||
|
if 'convertEntities' in kwargs:
|
||||||
|
warnings.warn(
|
||||||
|
"BS4 does not respect the convertEntities argument to the "
|
||||||
|
"BeautifulSoup constructor. Entities are always converted "
|
||||||
|
"to Unicode characters.")
|
||||||
|
|
||||||
|
if 'markupMassage' in kwargs:
|
||||||
|
del kwargs['markupMassage']
|
||||||
|
warnings.warn(
|
||||||
|
"BS4 does not respect the markupMassage argument to the "
|
||||||
|
"BeautifulSoup constructor. The tree builder is responsible "
|
||||||
|
"for any necessary markup massage.")
|
||||||
|
|
||||||
|
if 'smartQuotesTo' in kwargs:
|
||||||
|
del kwargs['smartQuotesTo']
|
||||||
|
warnings.warn(
|
||||||
|
"BS4 does not respect the smartQuotesTo argument to the "
|
||||||
|
"BeautifulSoup constructor. Smart quotes are always converted "
|
||||||
|
"to Unicode characters.")
|
||||||
|
|
||||||
|
if 'selfClosingTags' in kwargs:
|
||||||
|
del kwargs['selfClosingTags']
|
||||||
|
warnings.warn(
|
||||||
|
"BS4 does not respect the selfClosingTags argument to the "
|
||||||
|
"BeautifulSoup constructor. The tree builder is responsible "
|
||||||
|
"for understanding self-closing tags.")
|
||||||
|
|
||||||
|
if 'isHTML' in kwargs:
|
||||||
|
del kwargs['isHTML']
|
||||||
|
warnings.warn(
|
||||||
|
"BS4 does not respect the isHTML argument to the "
|
||||||
|
"BeautifulSoup constructor. You can pass in features='html' "
|
||||||
|
"or features='xml' to get a builder capable of handling "
|
||||||
|
"one or the other.")
|
||||||
|
|
||||||
|
def deprecated_argument(old_name, new_name):
|
||||||
|
if old_name in kwargs:
|
||||||
|
warnings.warn(
|
||||||
|
'The "%s" argument to the BeautifulSoup constructor '
|
||||||
|
'has been renamed to "%s."' % (old_name, new_name))
|
||||||
|
value = kwargs[old_name]
|
||||||
|
del kwargs[old_name]
|
||||||
|
return value
|
||||||
|
return None
|
||||||
|
|
||||||
|
parse_only = parse_only or deprecated_argument(
|
||||||
|
"parseOnlyThese", "parse_only")
|
||||||
|
|
||||||
|
from_encoding = from_encoding or deprecated_argument(
|
||||||
|
"fromEncoding", "from_encoding")
|
||||||
|
|
||||||
|
if len(kwargs) > 0:
|
||||||
|
arg = kwargs.keys().pop()
|
||||||
|
raise TypeError(
|
||||||
|
"__init__() got an unexpected keyword argument '%s'" % arg)
|
||||||
|
|
||||||
|
if builder is None:
|
||||||
|
if isinstance(features, basestring):
|
||||||
|
features = [features]
|
||||||
|
if features is None or len(features) == 0:
|
||||||
|
features = self.DEFAULT_BUILDER_FEATURES
|
||||||
|
builder_class = builder_registry.lookup(*features)
|
||||||
|
if builder_class is None:
|
||||||
|
raise FeatureNotFound(
|
||||||
|
"Couldn't find a tree builder with the features you "
|
||||||
|
"requested: %s. Do you need to install a parser library?"
|
||||||
|
% ",".join(features))
|
||||||
|
builder = builder_class()
|
||||||
|
self.builder = builder
|
||||||
|
self.is_xml = builder.is_xml
|
||||||
|
self.builder.soup = self
|
||||||
|
|
||||||
|
self.parse_only = parse_only
|
||||||
|
|
||||||
|
self.reset()
|
||||||
|
|
||||||
|
if hasattr(markup, 'read'): # It's a file-type object.
|
||||||
|
markup = markup.read()
|
||||||
|
(self.markup, self.original_encoding, self.declared_html_encoding,
|
||||||
|
self.contains_replacement_characters) = (
|
||||||
|
self.builder.prepare_markup(markup, from_encoding))
|
||||||
|
|
||||||
|
try:
|
||||||
|
self._feed()
|
||||||
|
except StopParsing:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Clear out the markup and remove the builder's circular
|
||||||
|
# reference to this object.
|
||||||
|
self.markup = None
|
||||||
|
self.builder.soup = None
|
||||||
|
|
||||||
|
def _feed(self):
|
||||||
|
# Convert the document to Unicode.
|
||||||
|
self.builder.reset()
|
||||||
|
|
||||||
|
self.builder.feed(self.markup)
|
||||||
|
# Close out any unfinished strings and close all the open tags.
|
||||||
|
self.endData()
|
||||||
|
while self.currentTag.name != self.ROOT_TAG_NAME:
|
||||||
|
self.popTag()
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
|
||||||
|
self.hidden = 1
|
||||||
|
self.builder.reset()
|
||||||
|
self.currentData = []
|
||||||
|
self.currentTag = None
|
||||||
|
self.tagStack = []
|
||||||
|
self.pushTag(self)
|
||||||
|
|
||||||
|
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
|
||||||
|
"""Create a new tag associated with this soup."""
|
||||||
|
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
|
||||||
|
|
||||||
|
def new_string(self, s):
|
||||||
|
"""Create a new NavigableString associated with this soup."""
|
||||||
|
navigable = NavigableString(s)
|
||||||
|
navigable.setup()
|
||||||
|
return navigable
|
||||||
|
|
||||||
|
def insert_before(self, successor):
|
||||||
|
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
|
||||||
|
|
||||||
|
def insert_after(self, successor):
|
||||||
|
raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
|
||||||
|
|
||||||
|
def popTag(self):
|
||||||
|
tag = self.tagStack.pop()
|
||||||
|
#print "Pop", tag.name
|
||||||
|
if self.tagStack:
|
||||||
|
self.currentTag = self.tagStack[-1]
|
||||||
|
return self.currentTag
|
||||||
|
|
||||||
|
def pushTag(self, tag):
|
||||||
|
#print "Push", tag.name
|
||||||
|
if self.currentTag:
|
||||||
|
self.currentTag.contents.append(tag)
|
||||||
|
self.tagStack.append(tag)
|
||||||
|
self.currentTag = self.tagStack[-1]
|
||||||
|
|
||||||
|
def endData(self, containerClass=NavigableString):
|
||||||
|
if self.currentData:
|
||||||
|
currentData = u''.join(self.currentData)
|
||||||
|
if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
|
||||||
|
not set([tag.name for tag in self.tagStack]).intersection(
|
||||||
|
self.builder.preserve_whitespace_tags)):
|
||||||
|
if '\n' in currentData:
|
||||||
|
currentData = '\n'
|
||||||
|
else:
|
||||||
|
currentData = ' '
|
||||||
|
self.currentData = []
|
||||||
|
if self.parse_only and len(self.tagStack) <= 1 and \
|
||||||
|
(not self.parse_only.text or \
|
||||||
|
not self.parse_only.search(currentData)):
|
||||||
|
return
|
||||||
|
o = containerClass(currentData)
|
||||||
|
self.object_was_parsed(o)
|
||||||
|
|
||||||
|
def object_was_parsed(self, o, parent=None, previous_element=None):
|
||||||
|
"""Add an object to the parse tree."""
|
||||||
|
parent = parent or self.currentTag
|
||||||
|
previous_element = previous_element or self.previous_element
|
||||||
|
o.setup(parent, previous_element)
|
||||||
|
if self.previous_element:
|
||||||
|
self.previous_element.next_element = o
|
||||||
|
self.previous_element = o
|
||||||
|
parent.contents.append(o)
|
||||||
|
|
||||||
|
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||||
|
"""Pops the tag stack up to and including the most recent
|
||||||
|
instance of the given tag. If inclusivePop is false, pops the tag
|
||||||
|
stack up to but *not* including the most recent instqance of
|
||||||
|
the given tag."""
|
||||||
|
#print "Popping to %s" % name
|
||||||
|
if name == self.ROOT_TAG_NAME:
|
||||||
|
return
|
||||||
|
|
||||||
|
numPops = 0
|
||||||
|
mostRecentTag = None
|
||||||
|
|
||||||
|
for i in range(len(self.tagStack) - 1, 0, -1):
|
||||||
|
if (name == self.tagStack[i].name
|
||||||
|
and nsprefix == self.tagStack[i].prefix):
|
||||||
|
numPops = len(self.tagStack) - i
|
||||||
|
break
|
||||||
|
if not inclusivePop:
|
||||||
|
numPops = numPops - 1
|
||||||
|
|
||||||
|
for i in range(0, numPops):
|
||||||
|
mostRecentTag = self.popTag()
|
||||||
|
return mostRecentTag
|
||||||
|
|
||||||
|
def handle_starttag(self, name, namespace, nsprefix, attrs):
|
||||||
|
"""Push a start tag on to the stack.
|
||||||
|
|
||||||
|
If this method returns None, the tag was rejected by the
|
||||||
|
SoupStrainer. You should proceed as if the tag had not occured
|
||||||
|
in the document. For instance, if this was a self-closing tag,
|
||||||
|
don't call handle_endtag.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# print "Start tag %s: %s" % (name, attrs)
|
||||||
|
self.endData()
|
||||||
|
|
||||||
|
if (self.parse_only and len(self.tagStack) <= 1
|
||||||
|
and (self.parse_only.text
|
||||||
|
or not self.parse_only.search_tag(name, attrs))):
|
||||||
|
return None
|
||||||
|
|
||||||
|
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
|
||||||
|
self.currentTag, self.previous_element)
|
||||||
|
if tag is None:
|
||||||
|
return tag
|
||||||
|
if self.previous_element:
|
||||||
|
self.previous_element.next_element = tag
|
||||||
|
self.previous_element = tag
|
||||||
|
self.pushTag(tag)
|
||||||
|
return tag
|
||||||
|
|
||||||
|
def handle_endtag(self, name, nsprefix=None):
|
||||||
|
#print "End tag: " + name
|
||||||
|
self.endData()
|
||||||
|
self._popToTag(name, nsprefix)
|
||||||
|
|
||||||
|
def handle_data(self, data):
|
||||||
|
self.currentData.append(data)
|
||||||
|
|
||||||
|
def decode(self, pretty_print=False,
|
||||||
|
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||||
|
formatter="minimal"):
|
||||||
|
"""Returns a string or Unicode representation of this document.
|
||||||
|
To get Unicode, pass None for encoding."""
|
||||||
|
|
||||||
|
if self.is_xml:
|
||||||
|
# Print the XML declaration
|
||||||
|
encoding_part = ''
|
||||||
|
if eventual_encoding != None:
|
||||||
|
encoding_part = ' encoding="%s"' % eventual_encoding
|
||||||
|
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
|
||||||
|
else:
|
||||||
|
prefix = u''
|
||||||
|
if not pretty_print:
|
||||||
|
indent_level = None
|
||||||
|
else:
|
||||||
|
indent_level = 0
|
||||||
|
return prefix + super(BeautifulSoup, self).decode(
|
||||||
|
indent_level, eventual_encoding, formatter)
|
||||||
|
|
||||||
|
class BeautifulStoneSoup(BeautifulSoup):
|
||||||
|
"""Deprecated interface to an XML parser."""
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
kwargs['features'] = 'xml'
|
||||||
|
warnings.warn(
|
||||||
|
'The BeautifulStoneSoup class is deprecated. Instead of using '
|
||||||
|
'it, pass features="xml" into the BeautifulSoup constructor.')
|
||||||
|
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class StopParsing(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class FeatureNotFound(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
#By default, act as an HTML pretty-printer.
|
||||||
|
if __name__ == '__main__':
|
||||||
|
import sys
|
||||||
|
soup = BeautifulSoup(sys.stdin)
|
||||||
|
print soup.prettify()
|
@ -0,0 +1,316 @@
|
|||||||
|
from collections import defaultdict
|
||||||
|
import itertools
|
||||||
|
import sys
|
||||||
|
from bs4.element import (
|
||||||
|
CharsetMetaAttributeValue,
|
||||||
|
ContentMetaAttributeValue,
|
||||||
|
whitespace_re
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'HTMLTreeBuilder',
|
||||||
|
'SAXTreeBuilder',
|
||||||
|
'TreeBuilder',
|
||||||
|
'TreeBuilderRegistry',
|
||||||
|
]
|
||||||
|
|
||||||
|
# Some useful features for a TreeBuilder to have. Builders advertise a
# subset of these in their `features` list, and BeautifulSoup looks
# builders up by feature via the registry below.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
|
||||||
|
|
||||||
|
|
||||||
|
class TreeBuilderRegistry(object):
    """Keeps track of available TreeBuilder classes, indexed by the
    features they advertise, so a builder can be chosen by feature."""

    def __init__(self):
        # Maps a feature name to the builder classes advertising it,
        # most recently registered first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder, most recently registered first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Return the best registered builder class, or None.

        A builder qualifies if it advertises every requested feature
        that at least one builder advertises; features nobody
        advertises are silently ignored. With no features requested,
        the most recently registered builder wins.
        """
        if not self.builders:
            # Nothing has been registered at all.
            return None

        if not features:
            # No preference expressed: most recent registration wins.
            return self.builders[0]

        # Intersect the builder sets for each requested feature,
        # remembering the ordered candidate list from the first
        # feature that matched anything (that order decides ties).
        ordered_candidates = None
        surviving = None
        for feature in features:
            matches = self.builders_for_feature.get(feature, [])
            if not matches:
                continue
            if ordered_candidates is None:
                ordered_candidates = matches
                surviving = set(matches)
            else:
                surviving = surviving.intersection(set(matches))

        # No requested feature matched anything.
        if surviving is None:
            return None
        # Return the first candidate (in registration-priority order)
        # that survived every intersection.
        for candidate in ordered_candidates:
            if candidate in surviving:
                return candidate
        return None
|
||||||
|
|
||||||
|
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry. This is the single shared
# module-level registry instance.
builder_registry = TreeBuilderRegistry()
|
||||||
|
|
||||||
|
class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree.

    This is the abstract base class; concrete subclasses wrap a real
    parser (html.parser, html5lib, lxml, ...).
    """

    # Feature strings used by TreeBuilderRegistry.lookup().
    features = []

    # True for XML builders, False for HTML builders.
    is_xml = False
    # Tags whose whitespace must not be collapsed on output.
    preserve_whitespace_tags = set()
    # When None, a tag is considered an empty-element tag when and only
    # when it has no contents; otherwise only names in this set qualify.
    empty_element_tags = None

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}

    def __init__(self):
        self.soup = None

    def reset(self):
        """Discard parser state; subclasses override as needed."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Run the markup through the underlying parser."""
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return a 4-tuple (markup, original encoding, declared
        encoding, whether replacement characters were substituted).

        The default implementation performs no encoding detection.
        """
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Hook for encoding-substitution setup; returns True when a
        substitution was installed. The base class never substitutes."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place.
        """
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), [])
            for cdata_list_attr in itertools.chain(universal, tag_specific):
                if cdata_list_attr in dict(attrs):
                    # Basically, we have a "class" attribute whose
                    # value is a whitespace-separated list of CSS
                    # classes. Split it into a list.
                    value = attrs[cdata_list_attr]
                    if isinstance(value, basestring):
                        values = whitespace_re.split(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list. If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[cdata_list_attr] = values
        return attrs
|
||||||
|
|
||||||
|
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    Forwards each SAX callback to the corresponding soup handler;
    namespace information is currently discarded.
    """

    def feed(self, markup):
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        # SAX attribute keys are (namespace, localname) tuples; keep
        # only the local name.
        attrs = dict((key[1], value) for key, value in list(attrs.items()))
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'. When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    # (The original dict literal listed "td" twice; the duplicate has
    # been removed -- both entries were ["headers"].)
    cdata_list_attributes = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" : ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    def set_up_substitutions(self, tag):
        """Install encoding-substitution stand-ins on a <meta> tag.

        Returns True when an HTML 5-style charset declaration was
        replaced with a stand-in object.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            # NOTE(review): meta_encoding is never set on this branch, so
            # the method returns False for HTML 4-style declarations even
            # though the attribute is replaced -- confirm this is intended.
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)
|
||||||
|
|
||||||
|
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module."""
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)

        if issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)
|
||||||
|
|
||||||
|
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass
|
@ -0,0 +1,221 @@
|
|||||||
|
__all__ = [
|
||||||
|
'HTML5TreeBuilder',
|
||||||
|
]
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
from bs4.builder import (
|
||||||
|
PERMISSIVE,
|
||||||
|
HTML,
|
||||||
|
HTML_5,
|
||||||
|
HTMLTreeBuilder,
|
||||||
|
)
|
||||||
|
from bs4.element import NamespacedAttribute
|
||||||
|
import html5lib
|
||||||
|
from html5lib.constants import namespaces
|
||||||
|
from bs4.element import (
|
||||||
|
Comment,
|
||||||
|
Doctype,
|
||||||
|
NavigableString,
|
||||||
|
Tag,
|
||||||
|
)
|
||||||
|
|
||||||
|
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding):
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
        return markup, None, None, False

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse the markup with html5lib, building into self.soup."""
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]

    def create_treebuilder(self, namespaceHTMLElements):
        # html5lib calls this factory once per parse; keep a handle on
        # the builder so feed() can reach it.
        self.underlying_builder = TreeBuilderForHtml5lib(
            self.soup, namespaceHTMLElements)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><head></head><body>%s</body></html>' % fragment
|
||||||
|
|
||||||
|
|
||||||
|
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
    """Adapter that lets html5lib build directly into a BeautifulSoup
    object tree."""

    def __init__(self, soup, namespaceHTMLElements):
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        # NOTE(review): BeautifulSoup is not imported in this module's
        # visible import block -- confirm the name is available at
        # runtime before relying on fragment parsing.
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
|
||||||
|
|
||||||
|
class AttrList(object):
    """Present a tag's attributes to html5lib as a dict-like object.

    Reads come from a snapshot taken at construction time; writes go
    through to the underlying element.
    """

    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        return list(self.attrs.items()).__iter__()

    def __setitem__(self, name, value):
        # (A leftover no-op debug expression was removed here.)
        self.element[name] = value

    def items(self):
        return list(self.attrs.items())

    def keys(self):
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in list(self.attrs.keys())
|
||||||
|
|
||||||
|
|
||||||
|
class Element(html5lib.treebuilders._base.Node):
    """html5lib Node backed by a Beautiful Soup Tag."""

    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # XXX This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + node.element)
            old_element.replace_with(new_element)
        else:
            self.soup.object_was_parsed(node.element, parent=self.element)

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes is not None and len(attributes) > 0:
            # Convert html5lib's (namespace, name) tuple keys into
            # NamespacedAttribute objects.
            # (An unused local `converted_attributes = []` was removed.)
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in attributes.items():
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        index = self.element.index(refNode.element)
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()

    def reparentChildren(self, newParent):
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(
                    Element(child, self.soup, namespaces["html"]))
            else:
                newParent.appendChild(
                    TextNode(child, self.soup))

    def cloneNode(self):
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        return self.element.contents

    def getNameTuple(self):
        # Unqualified names live in the HTML namespace.
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
|
||||||
|
|
||||||
|
class TextNode(Element):
    """html5lib Node backed by a NavigableString; text nodes have no
    name and cannot be cloned."""

    def __init__(self, element, soup):
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        raise NotImplementedError
|
@ -0,0 +1,244 @@
|
|||||||
|
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'HTMLParserTreeBuilder',
|
||||||
|
]
|
||||||
|
|
||||||
|
from HTMLParser import (
|
||||||
|
HTMLParser,
|
||||||
|
HTMLParseError,
|
||||||
|
)
|
||||||
|
import sys
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
|
||||||
|
# argument, which we'd like to set to False. Unfortunately,
|
||||||
|
# http://bugs.python.org/issue13273 makes strict=True a better bet
|
||||||
|
# before Python 3.2.3.
|
||||||
|
#
|
||||||
|
# At the end of this file, we monkeypatch HTMLParser so that
|
||||||
|
# strict=True works well on Python 3.2.2.
|
||||||
|
major, minor, release = sys.version_info[:3]
|
||||||
|
CONSTRUCTOR_TAKES_STRICT = (
|
||||||
|
major > 3
|
||||||
|
or (major == 3 and minor > 2)
|
||||||
|
or (major == 3 and minor == 2 and release >= 3))
|
||||||
|
|
||||||
|
from bs4.element import (
|
||||||
|
CData,
|
||||||
|
Comment,
|
||||||
|
Declaration,
|
||||||
|
Doctype,
|
||||||
|
ProcessingInstruction,
|
||||||
|
)
|
||||||
|
from bs4.dammit import EntitySubstitution, UnicodeDammit
|
||||||
|
|
||||||
|
from bs4.builder import (
|
||||||
|
HTML,
|
||||||
|
HTMLTreeBuilder,
|
||||||
|
STRICT,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
HTMLPARSER = 'html.parser'
|
||||||
|
|
||||||
|
class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
|
def handle_starttag(self, name, attrs):
|
||||||
|
# XXX namespace
|
||||||
|
self.soup.handle_starttag(name, None, None, dict(attrs))
|
||||||
|
|
||||||
|
def handle_endtag(self, name):
|
||||||
|
self.soup.handle_endtag(name)
|
||||||
|
|
||||||
|
def handle_data(self, data):
|
||||||
|
self.soup.handle_data(data)
|
||||||
|
|
||||||
|
def handle_charref(self, name):
|
||||||
|
# XXX workaround for a bug in HTMLParser. Remove this once
|
||||||
|
# it's fixed.
|
||||||
|
if name.startswith('x'):
|
||||||
|
real_name = int(name.lstrip('x'), 16)
|
||||||
|
else:
|
||||||
|
real_name = int(name)
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = unichr(real_name)
|
||||||
|
except (ValueError, OverflowError), e:
|
||||||
|
data = u"\N{REPLACEMENT CHARACTER}"
|
||||||
|
|
||||||
|
self.handle_data(data)
|
||||||
|
|
||||||
|
def handle_entityref(self, name):
|
||||||
|
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
|
||||||
|
if character is not None:
|
||||||
|
data = character
|
||||||
|
else:
|
||||||
|
data = "&%s;" % name
|
||||||
|
self.handle_data(data)
|
||||||
|
|
||||||
|
def handle_comment(self, data):
|
||||||
|
self.soup.endData()
|
||||||
|
self.soup.handle_data(data)
|
||||||
|
self.soup.endData(Comment)
|
||||||
|
|
||||||
|
def handle_decl(self, data):
|
||||||
|
self.soup.endData()
|
||||||
|
if data.startswith("DOCTYPE "):
|
||||||
|
data = data[len("DOCTYPE "):]
|
||||||
|
self.soup.handle_data(data)
|
||||||
|
self.soup.endData(Doctype)
|
||||||
|
|
||||||
|
def unknown_decl(self, data):
|
||||||
|
if data.upper().startswith('CDATA['):
|
||||||
|
cls = CData
|
||||||
|
data = data[len('CDATA['):]
|
||||||
|
else:
|
||||||
|
cls = Declaration
|
||||||
|
self.soup.endData()
|
||||||
|
self.soup.handle_data(data)
|
||||||
|
self.soup.endData(cls)
|
||||||
|
|
||||||
|
def handle_pi(self, data):
|
||||||
|
self.soup.endData()
|
||||||
|
if data.endswith("?") and data.lower().startswith("xml"):
|
||||||
|
# "An XHTML processing instruction using the trailing '?'
|
||||||
|
# will cause the '?' to be included in data." - HTMLParser
|
||||||
|
# docs.
|
||||||
|
#
|
||||||
|
# Strip the question mark so we don't end up with two
|
||||||
|
# question marks.
|
||||||
|
data = data[:-1]
|
||||||
|
self.soup.handle_data(data)
|
||||||
|
self.soup.endData(ProcessingInstruction)
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
|
|
||||||
|
is_xml = False
|
||||||
|
features = [HTML, STRICT, HTMLPARSER]
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
if CONSTRUCTOR_TAKES_STRICT:
|
||||||
|
kwargs['strict'] = False
|
||||||
|
self.parser_args = (args, kwargs)
|
||||||
|
|
||||||
|
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||||
|
document_declared_encoding=None):
|
||||||
|
"""
|
||||||
|
:return: A 4-tuple (markup, original encoding, encoding
|
||||||
|
declared within markup, whether any characters had to be
|
||||||
|
replaced with REPLACEMENT CHARACTER).
|
||||||
|
"""
|
||||||
|
if isinstance(markup, unicode):
|
||||||
|
return markup, None, None, False
|
||||||
|
|
||||||
|
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||||
|
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
|
||||||
|
return (dammit.markup, dammit.original_encoding,
|
||||||
|
dammit.declared_html_encoding,
|
||||||
|
dammit.contains_replacement_characters)
|
||||||
|
|
||||||
|
def feed(self, markup):
|
||||||
|
args, kwargs = self.parser_args
|
||||||
|
parser = BeautifulSoupHTMLParser(*args, **kwargs)
|
||||||
|
parser.soup = self.soup
|
||||||
|
try:
|
||||||
|
parser.feed(markup)
|
||||||
|
except HTMLParseError, e:
|
||||||
|
warnings.warn(RuntimeWarning(
|
||||||
|
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||||
|
raise e
|
||||||
|
|
||||||
|
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        # Backported 3.2.3 implementation.
        # NOTE(review): defined at module level, so `self.__starttag_text`
        # is NOT name-mangled the way it is inside the HTMLParser class
        # body -- confirm get_starttag_text() still sees the value.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        # Only stop CDATA mode at the matching close tag.
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    CONSTRUCTOR_TAKES_STRICT = True
|
@ -0,0 +1,196 @@
|
|||||||
|
__all__ = [
|
||||||
|
'LXMLTreeBuilderForXML',
|
||||||
|
'LXMLTreeBuilder',
|
||||||
|
]
|
||||||
|
|
||||||
|
from StringIO import StringIO
|
||||||
|
import collections
|
||||||
|
from lxml import etree
|
||||||
|
from bs4.element import Comment, Doctype, NamespacedAttribute
|
||||||
|
from bs4.builder import (
|
||||||
|
FAST,
|
||||||
|
HTML,
|
||||||
|
HTMLTreeBuilder,
|
||||||
|
PERMISSIVE,
|
||||||
|
TreeBuilder,
|
||||||
|
XML)
|
||||||
|
from bs4.dammit import UnicodeDammit
|
||||||
|
|
||||||
|
LXML = 'lxml'
|
||||||
|
|
||||||
|
class LXMLTreeBuilderForXML(TreeBuilder):
    """Build a tree with lxml's event-driven (target) XML parser."""

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    # Well, it's permissive by XML parser standards.
    features = [LXML, XML, FAST, PERMISSIVE]

    # Markup is fed to lxml in chunks of this many bytes/characters.
    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
|
||||||
|
|
||||||
|
@property
def default_parser(self):
    # This can either return a parser object or a class, which
    # will be instantiated with default arguments.
    return etree.XMLParser(target=self, strip_cdata=False, recover=True)
|
||||||
|
|
||||||
|
def __init__(self, parser=None, empty_element_tags=None):
    """Set up the lxml target parser.

    :param parser: an lxml parser instance, or a callable that
        produces one; defaults to ``self.default_parser``.
    :param empty_element_tags: optional iterable of tag names to
        treat as empty-element tags.
    """
    if empty_element_tags is not None:
        self.empty_element_tags = set(empty_element_tags)
    if parser is None:
        # Use the default parser.
        parser = self.default_parser
    if isinstance(parser, collections.Callable):
        # Instantiate the parser with default arguments
        parser = parser(target=self, strip_cdata=False)
    self.parser = parser
    self.soup = None
    # Stack of inverted namespace maps, one entry per open tag that
    # introduced (or inherited) namespaces.
    self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||||
|
|
||||||
|
def _getNsTag(self, tag):
|
||||||
|
# Split the namespace URL out of a fully-qualified lxml tag
|
||||||
|
# name. Copied from lxml's src/lxml/sax.py.
|
||||||
|
if tag[0] == '{':
|
||||||
|
return tuple(tag[1:].split('}', 1))
|
||||||
|
else:
|
||||||
|
return (None, tag)
|
||||||
|
|
||||||
|
def prepare_markup(self, markup, user_specified_encoding=None,
                   document_declared_encoding=None):
    """
    :return: A 4-tuple (markup, original encoding, encoding
    declared within markup, whether any characters had to be
    replaced with REPLACEMENT CHARACTER).
    """
    if isinstance(markup, unicode):
        # Already decoded; nothing to detect.
        return markup, None, None, False

    try_encodings = [user_specified_encoding, document_declared_encoding]
    dammit = UnicodeDammit(markup, try_encodings, is_html=True)
    return (dammit.markup, dammit.original_encoding,
            dammit.declared_html_encoding,
            dammit.contains_replacement_characters)
|
||||||
|
|
||||||
|
def feed(self, markup):
    """Feed the markup to lxml in CHUNK_SIZE pieces."""
    if isinstance(markup, basestring):
        markup = StringIO(markup)
    # Call feed() at least once, even if the markup is empty,
    # or the parser won't be initialized.
    data = markup.read(self.CHUNK_SIZE)
    self.parser.feed(data)
    while data != '':
        # Now call feed() on the rest of the data, chunk by chunk.
        data = markup.read(self.CHUNK_SIZE)
        if data != '':
            self.parser.feed(data)
    self.parser.close()
|
||||||
|
|
||||||
|
def close(self):
    # Reset the namespace stack for the next document.
    self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||||
|
|
||||||
|
def start(self, name, attrs, nsmap={}):
|
||||||
|
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
||||||
|
attrs = dict(attrs)
|
||||||
|
nsprefix = None
|
||||||
|
# Invert each namespace map as it comes in.
|
||||||
|
if len(self.nsmaps) > 1:
|
||||||
|
# There are no new namespaces for this tag, but
|
||||||
|
# non-default namespaces are in play, so we need a
|
||||||
|
# separate tag stack to know when they end.
|
||||||
|
self.nsmaps.append(None)
|
||||||
|
elif len(nsmap) > 0:
|
||||||
|
# A new namespace mapping has come into play.
|
||||||
|
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
||||||
|
self.nsmaps.append(inverted_nsmap)
|
||||||
|
# Also treat the namespace mapping as a set of attributes on the
|
||||||
|
# tag, so we can recreate it later.
|
||||||
|
attrs = attrs.copy()
|
||||||
|
for prefix, namespace in nsmap.items():
|
||||||
|
attribute = NamespacedAttribute(
|
||||||
|
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
||||||
|
attrs[attribute] = namespace
|
||||||
|
|
||||||
|
# Namespaces are in play. Find any attributes that came in
|
||||||
|
# from lxml with namespaces attached to their names, and
|
||||||
|
# turn then into NamespacedAttribute objects.
|
||||||
|
new_attrs = {}
|
||||||
|
for attr, value in attrs.items():
|
||||||
|
namespace, attr = self._getNsTag(attr)
|
||||||
|
if namespace is None:
|
||||||
|
new_attrs[attr] = value
|
||||||
|
else:
|
||||||
|
nsprefix = self._prefix_for_namespace(namespace)
|
||||||
|
attr = NamespacedAttribute(nsprefix, attr, namespace)
|
||||||
|
new_attrs[attr] = value
|
||||||
|
attrs = new_attrs
|
||||||
|
|
||||||
|
namespace, name = self._getNsTag(name)
|
||||||
|
nsprefix = self._prefix_for_namespace(namespace)
|
||||||
|
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
|
||||||
|
|
||||||
|
def _prefix_for_namespace(self, namespace):
|
||||||
|
"""Find the currently active prefix for the given namespace."""
|
||||||
|
if namespace is None:
|
||||||
|
return None
|
||||||
|
for inverted_nsmap in reversed(self.nsmaps):
|
||||||
|
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||||
|
return inverted_nsmap[namespace]
|
||||||
|
return None
|
||||||
|
|
||||||
|
def end(self, name):
|
||||||
|
self.soup.endData()
|
||||||
|
completed_tag = self.soup.tagStack[-1]
|
||||||
|
namespace, name = self._getNsTag(name)
|
||||||
|
nsprefix = None
|
||||||
|
if namespace is not None:
|
||||||
|
for inverted_nsmap in reversed(self.nsmaps):
|
||||||
|
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||||
|
nsprefix = inverted_nsmap[namespace]
|
||||||
|
break
|
||||||
|
self.soup.handle_endtag(name, nsprefix)
|
||||||
|
if len(self.nsmaps) > 1:
|
||||||
|
# This tag, or one of its parents, introduced a namespace
|
||||||
|
# mapping, so pop it off the stack.
|
||||||
|
self.nsmaps.pop()
|
||||||
|
|
||||||
|
def pi(self, target, data):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def data(self, content):
|
||||||
|
self.soup.handle_data(content)
|
||||||
|
|
||||||
|
def doctype(self, name, pubid, system):
|
||||||
|
self.soup.endData()
|
||||||
|
doctype = Doctype.for_name_and_ids(name, pubid, system)
|
||||||
|
self.soup.object_was_parsed(doctype)
|
||||||
|
|
||||||
|
def comment(self, content):
|
||||||
|
"Handle comments as Comment objects."
|
||||||
|
self.soup.endData()
|
||||||
|
self.soup.handle_data(content)
|
||||||
|
self.soup.endData(Comment)
|
||||||
|
|
||||||
|
def test_fragment_to_document(self, fragment):
|
||||||
|
"""See `TreeBuilder`."""
|
||||||
|
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
||||||
|
|
||||||
|
|
||||||
|
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||||
|
|
||||||
|
features = [LXML, HTML, FAST, PERMISSIVE]
|
||||||
|
is_xml = False
|
||||||
|
|
||||||
|
@property
|
||||||
|
def default_parser(self):
|
||||||
|
return etree.HTMLParser
|
||||||
|
|
||||||
|
def feed(self, markup):
|
||||||
|
self.parser.feed(markup)
|
||||||
|
self.parser.close()
|
||||||
|
|
||||||
|
def test_fragment_to_document(self, fragment):
|
||||||
|
"""See `TreeBuilder`."""
|
||||||
|
return u'<html><body>%s</body></html>' % fragment
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,554 @@
|
|||||||
|
"""Helper classes for tests."""
|
||||||
|
|
||||||
|
import copy
|
||||||
|
import functools
|
||||||
|
import unittest
|
||||||
|
from unittest import TestCase
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import (
|
||||||
|
CharsetMetaAttributeValue,
|
||||||
|
Comment,
|
||||||
|
ContentMetaAttributeValue,
|
||||||
|
Doctype,
|
||||||
|
SoupStrainer,
|
||||||
|
)
|
||||||
|
|
||||||
|
from bs4.builder import HTMLParserTreeBuilder
|
||||||
|
default_builder = HTMLParserTreeBuilder
|
||||||
|
|
||||||
|
|
||||||
|
class SoupTest(unittest.TestCase):
|
||||||
|
|
||||||
|
@property
|
||||||
|
def default_builder(self):
|
||||||
|
return default_builder()
|
||||||
|
|
||||||
|
def soup(self, markup, **kwargs):
|
||||||
|
"""Build a Beautiful Soup object from markup."""
|
||||||
|
builder = kwargs.pop('builder', self.default_builder)
|
||||||
|
return BeautifulSoup(markup, builder=builder, **kwargs)
|
||||||
|
|
||||||
|
def document_for(self, markup):
|
||||||
|
"""Turn an HTML fragment into a document.
|
||||||
|
|
||||||
|
The details depend on the builder.
|
||||||
|
"""
|
||||||
|
return self.default_builder.test_fragment_to_document(markup)
|
||||||
|
|
||||||
|
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
|
||||||
|
builder = self.default_builder
|
||||||
|
obj = BeautifulSoup(to_parse, builder=builder)
|
||||||
|
if compare_parsed_to is None:
|
||||||
|
compare_parsed_to = to_parse
|
||||||
|
|
||||||
|
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLTreeBuilderSmokeTest(object):
|
||||||
|
|
||||||
|
"""A basic test of a treebuilder's competence.
|
||||||
|
|
||||||
|
Any HTML treebuilder, present or future, should be able to pass
|
||||||
|
these tests. With invalid markup, there's room for interpretation,
|
||||||
|
and different parsers can handle it differently. But with the
|
||||||
|
markup in these tests, there's not much room for interpretation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def assertDoctypeHandled(self, doctype_fragment):
|
||||||
|
"""Assert that a given doctype string is handled correctly."""
|
||||||
|
doctype_str, soup = self._document_with_doctype(doctype_fragment)
|
||||||
|
|
||||||
|
# Make sure a Doctype object was created.
|
||||||
|
doctype = soup.contents[0]
|
||||||
|
self.assertEqual(doctype.__class__, Doctype)
|
||||||
|
self.assertEqual(doctype, doctype_fragment)
|
||||||
|
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
|
||||||
|
|
||||||
|
# Make sure that the doctype was correctly associated with the
|
||||||
|
# parse tree and that the rest of the document parsed.
|
||||||
|
self.assertEqual(soup.p.contents[0], 'foo')
|
||||||
|
|
||||||
|
def _document_with_doctype(self, doctype_fragment):
|
||||||
|
"""Generate and parse a document with the given doctype."""
|
||||||
|
doctype = '<!DOCTYPE %s>' % doctype_fragment
|
||||||
|
markup = doctype + '\n<p>foo</p>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
return doctype, soup
|
||||||
|
|
||||||
|
def test_normal_doctypes(self):
|
||||||
|
"""Make sure normal, everyday HTML doctypes are handled correctly."""
|
||||||
|
self.assertDoctypeHandled("html")
|
||||||
|
self.assertDoctypeHandled(
|
||||||
|
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
|
||||||
|
|
||||||
|
def test_public_doctype_with_url(self):
|
||||||
|
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
|
||||||
|
self.assertDoctypeHandled(doctype)
|
||||||
|
|
||||||
|
def test_system_doctype(self):
|
||||||
|
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
|
||||||
|
|
||||||
|
def test_namespaced_system_doctype(self):
|
||||||
|
# We can handle a namespaced doctype with a system ID.
|
||||||
|
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
|
||||||
|
|
||||||
|
def test_namespaced_public_doctype(self):
|
||||||
|
# Test a namespaced doctype with a public id.
|
||||||
|
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
|
||||||
|
|
||||||
|
def test_real_xhtml_document(self):
|
||||||
|
"""A real XHTML document should come out more or less the same as it went in."""
|
||||||
|
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>Hello.</title></head>
|
||||||
|
<body>Goodbye.</body>
|
||||||
|
</html>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(
|
||||||
|
soup.encode("utf-8").replace(b"\n", b""),
|
||||||
|
markup.replace(b"\n", b""))
|
||||||
|
|
||||||
|
def test_deepcopy(self):
|
||||||
|
"""Make sure you can copy the tree builder.
|
||||||
|
|
||||||
|
This is important because the builder is part of a
|
||||||
|
BeautifulSoup object, and we want to be able to copy that.
|
||||||
|
"""
|
||||||
|
copy.deepcopy(self.default_builder)
|
||||||
|
|
||||||
|
def test_p_tag_is_never_empty_element(self):
|
||||||
|
"""A <p> tag is never designated as an empty-element tag.
|
||||||
|
|
||||||
|
Even if the markup shows it as an empty-element tag, it
|
||||||
|
shouldn't be presented that way.
|
||||||
|
"""
|
||||||
|
soup = self.soup("<p/>")
|
||||||
|
self.assertFalse(soup.p.is_empty_element)
|
||||||
|
self.assertEqual(str(soup.p), "<p></p>")
|
||||||
|
|
||||||
|
def test_unclosed_tags_get_closed(self):
|
||||||
|
"""A tag that's not closed by the end of the document should be closed.
|
||||||
|
|
||||||
|
This applies to all tags except empty-element tags.
|
||||||
|
"""
|
||||||
|
self.assertSoupEquals("<p>", "<p></p>")
|
||||||
|
self.assertSoupEquals("<b>", "<b></b>")
|
||||||
|
|
||||||
|
self.assertSoupEquals("<br>", "<br/>")
|
||||||
|
|
||||||
|
def test_br_is_always_empty_element_tag(self):
|
||||||
|
"""A <br> tag is designated as an empty-element tag.
|
||||||
|
|
||||||
|
Some parsers treat <br></br> as one <br/> tag, some parsers as
|
||||||
|
two tags, but it should always be an empty-element tag.
|
||||||
|
"""
|
||||||
|
soup = self.soup("<br></br>")
|
||||||
|
self.assertTrue(soup.br.is_empty_element)
|
||||||
|
self.assertEqual(str(soup.br), "<br/>")
|
||||||
|
|
||||||
|
def test_nested_formatting_elements(self):
|
||||||
|
self.assertSoupEquals("<em><em></em></em>")
|
||||||
|
|
||||||
|
def test_comment(self):
|
||||||
|
# Comments are represented as Comment objects.
|
||||||
|
markup = "<p>foo<!--foobar-->baz</p>"
|
||||||
|
self.assertSoupEquals(markup)
|
||||||
|
|
||||||
|
soup = self.soup(markup)
|
||||||
|
comment = soup.find(text="foobar")
|
||||||
|
self.assertEqual(comment.__class__, Comment)
|
||||||
|
|
||||||
|
# The comment is properly integrated into the tree.
|
||||||
|
foo = soup.find(text="foo")
|
||||||
|
self.assertEqual(comment, foo.next_element)
|
||||||
|
baz = soup.find(text="baz")
|
||||||
|
self.assertEquals(comment, baz.previous_element)
|
||||||
|
|
||||||
|
def test_preserved_whitespace_in_pre_and_textarea(self):
|
||||||
|
"""Whitespace must be preserved in <pre> and <textarea> tags."""
|
||||||
|
self.assertSoupEquals("<pre> </pre>")
|
||||||
|
self.assertSoupEquals("<textarea> woo </textarea>")
|
||||||
|
|
||||||
|
def test_nested_inline_elements(self):
|
||||||
|
"""Inline elements can be nested indefinitely."""
|
||||||
|
b_tag = "<b>Inside a B tag</b>"
|
||||||
|
self.assertSoupEquals(b_tag)
|
||||||
|
|
||||||
|
nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
|
||||||
|
self.assertSoupEquals(nested_b_tag)
|
||||||
|
|
||||||
|
double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
|
||||||
|
self.assertSoupEquals(nested_b_tag)
|
||||||
|
|
||||||
|
def test_nested_block_level_elements(self):
|
||||||
|
"""Block elements can be nested."""
|
||||||
|
soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
|
||||||
|
blockquote = soup.blockquote
|
||||||
|
self.assertEqual(blockquote.p.b.string, 'Foo')
|
||||||
|
self.assertEqual(blockquote.b.string, 'Foo')
|
||||||
|
|
||||||
|
def test_correctly_nested_tables(self):
|
||||||
|
"""One table can go inside another one."""
|
||||||
|
markup = ('<table id="1">'
|
||||||
|
'<tr>'
|
||||||
|
"<td>Here's another table:"
|
||||||
|
'<table id="2">'
|
||||||
|
'<tr><td>foo</td></tr>'
|
||||||
|
'</table></td>')
|
||||||
|
|
||||||
|
self.assertSoupEquals(
|
||||||
|
markup,
|
||||||
|
'<table id="1"><tr><td>Here\'s another table:'
|
||||||
|
'<table id="2"><tr><td>foo</td></tr></table>'
|
||||||
|
'</td></tr></table>')
|
||||||
|
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<table><thead><tr><td>Foo</td></tr></thead>"
|
||||||
|
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||||
|
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||||
|
|
||||||
|
def test_deeply_nested_multivalued_attribute(self):
|
||||||
|
# html5lib can set the attributes of the same tag many times
|
||||||
|
# as it rearranges the tree. This has caused problems with
|
||||||
|
# multivalued attributes.
|
||||||
|
markup = '<table><div><div class="css"></div></div></table>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(["css"], soup.div.div['class'])
|
||||||
|
|
||||||
|
def test_angle_brackets_in_attribute_values_are_escaped(self):
|
||||||
|
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
|
||||||
|
|
||||||
|
def test_entities_in_attributes_converted_to_unicode(self):
|
||||||
|
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
||||||
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
|
|
||||||
|
def test_entities_in_text_converted_to_unicode(self):
|
||||||
|
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
||||||
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
|
|
||||||
|
def test_quot_entity_converted_to_quotation_mark(self):
|
||||||
|
self.assertSoupEquals("<p>I said "good day!"</p>",
|
||||||
|
'<p>I said "good day!"</p>')
|
||||||
|
|
||||||
|
def test_out_of_range_entity(self):
|
||||||
|
expect = u"\N{REPLACEMENT CHARACTER}"
|
||||||
|
self.assertSoupEquals("�", expect)
|
||||||
|
self.assertSoupEquals("�", expect)
|
||||||
|
self.assertSoupEquals("�", expect)
|
||||||
|
|
||||||
|
def test_basic_namespaces(self):
|
||||||
|
"""Parsers don't need to *understand* namespaces, but at the
|
||||||
|
very least they should not choke on namespaces or lose
|
||||||
|
data."""
|
||||||
|
|
||||||
|
markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(markup, soup.encode())
|
||||||
|
html = soup.html
|
||||||
|
self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
|
||||||
|
self.assertEqual(
|
||||||
|
'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
|
||||||
|
self.assertEqual(
|
||||||
|
'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
|
||||||
|
|
||||||
|
def test_multivalued_attribute_value_becomes_list(self):
|
||||||
|
markup = b'<a class="foo bar">'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(['foo', 'bar'], soup.a['class'])
|
||||||
|
|
||||||
|
#
|
||||||
|
# Generally speaking, tests below this point are more tests of
|
||||||
|
# Beautiful Soup than tests of the tree builders. But parsers are
|
||||||
|
# weird, so we run these tests separately for every tree builder
|
||||||
|
# to detect any differences between them.
|
||||||
|
#
|
||||||
|
|
||||||
|
def test_soupstrainer(self):
|
||||||
|
"""Parsers should be able to work with SoupStrainers."""
|
||||||
|
strainer = SoupStrainer("b")
|
||||||
|
soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
|
||||||
|
parse_only=strainer)
|
||||||
|
self.assertEqual(soup.decode(), "<b>bold</b>")
|
||||||
|
|
||||||
|
def test_single_quote_attribute_values_become_double_quotes(self):
|
||||||
|
self.assertSoupEquals("<foo attr='bar'></foo>",
|
||||||
|
'<foo attr="bar"></foo>')
|
||||||
|
|
||||||
|
def test_attribute_values_with_nested_quotes_are_left_alone(self):
|
||||||
|
text = """<foo attr='bar "brawls" happen'>a</foo>"""
|
||||||
|
self.assertSoupEquals(text)
|
||||||
|
|
||||||
|
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
|
||||||
|
text = """<foo attr='bar "brawls" happen'>a</foo>"""
|
||||||
|
soup = self.soup(text)
|
||||||
|
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
|
||||||
|
self.assertSoupEquals(
|
||||||
|
soup.foo.decode(),
|
||||||
|
"""<foo attr="Brawls happen at "Bob\'s Bar"">a</foo>""")
|
||||||
|
|
||||||
|
def test_ampersand_in_attribute_value_gets_escaped(self):
|
||||||
|
self.assertSoupEquals('<this is="really messed up & stuff"></this>',
|
||||||
|
'<this is="really messed up & stuff"></this>')
|
||||||
|
|
||||||
|
self.assertSoupEquals(
|
||||||
|
'<a href="http://example.org?a=1&b=2;3">foo</a>',
|
||||||
|
'<a href="http://example.org?a=1&b=2;3">foo</a>')
|
||||||
|
|
||||||
|
def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
|
||||||
|
self.assertSoupEquals('<a href="http://example.org?a=1&b=2;3"></a>')
|
||||||
|
|
||||||
|
def test_entities_in_strings_converted_during_parsing(self):
|
||||||
|
# Both XML and HTML entities are converted to Unicode characters
|
||||||
|
# during parsing.
|
||||||
|
text = "<p><<sacré bleu!>></p>"
|
||||||
|
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
|
||||||
|
self.assertSoupEquals(text, expected)
|
||||||
|
|
||||||
|
def test_smart_quotes_converted_on_the_way_in(self):
|
||||||
|
# Microsoft smart quotes are converted to Unicode characters during
|
||||||
|
# parsing.
|
||||||
|
quote = b"<p>\x91Foo\x92</p>"
|
||||||
|
soup = self.soup(quote)
|
||||||
|
self.assertEqual(
|
||||||
|
soup.p.string,
|
||||||
|
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
|
||||||
|
|
||||||
|
def test_non_breaking_spaces_converted_on_the_way_in(self):
|
||||||
|
soup = self.soup("<a> </a>")
|
||||||
|
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
|
||||||
|
|
||||||
|
def test_entities_converted_on_the_way_out(self):
|
||||||
|
text = "<p><<sacré bleu!>></p>"
|
||||||
|
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
|
||||||
|
soup = self.soup(text)
|
||||||
|
self.assertEqual(soup.p.encode("utf-8"), expected)
|
||||||
|
|
||||||
|
def test_real_iso_latin_document(self):
|
||||||
|
# Smoke test of interrelated functionality, using an
|
||||||
|
# easy-to-understand document.
|
||||||
|
|
||||||
|
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
|
||||||
|
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
|
||||||
|
|
||||||
|
# That's because we're going to encode it into ISO-Latin-1, and use
|
||||||
|
# that to test.
|
||||||
|
iso_latin_html = unicode_html.encode("iso-8859-1")
|
||||||
|
|
||||||
|
# Parse the ISO-Latin-1 HTML.
|
||||||
|
soup = self.soup(iso_latin_html)
|
||||||
|
# Encode it to UTF-8.
|
||||||
|
result = soup.encode("utf-8")
|
||||||
|
|
||||||
|
# What do we expect the result to look like? Well, it would
|
||||||
|
# look like unicode_html, except that the META tag would say
|
||||||
|
# UTF-8 instead of ISO-Latin-1.
|
||||||
|
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
|
||||||
|
|
||||||
|
# And, of course, it would be in UTF-8, not Unicode.
|
||||||
|
expected = expected.encode("utf-8")
|
||||||
|
|
||||||
|
# Ta-da!
|
||||||
|
self.assertEqual(result, expected)
|
||||||
|
|
||||||
|
def test_real_shift_jis_document(self):
|
||||||
|
# Smoke test to make sure the parser can handle a document in
|
||||||
|
# Shift-JIS encoding, without choking.
|
||||||
|
shift_jis_html = (
|
||||||
|
b'<html><head></head><body><pre>'
|
||||||
|
b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
|
||||||
|
b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
|
||||||
|
b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
|
||||||
|
b'</pre></body></html>')
|
||||||
|
unicode_html = shift_jis_html.decode("shift-jis")
|
||||||
|
soup = self.soup(unicode_html)
|
||||||
|
|
||||||
|
# Make sure the parse tree is correctly encoded to various
|
||||||
|
# encodings.
|
||||||
|
self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
|
||||||
|
self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
|
||||||
|
|
||||||
|
def test_real_hebrew_document(self):
|
||||||
|
# A real-world test to make sure we can convert ISO-8859-9 (a
|
||||||
|
# Hebrew encoding) to UTF-8.
|
||||||
|
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
|
||||||
|
soup = self.soup(
|
||||||
|
hebrew_document, from_encoding="iso8859-8")
|
||||||
|
self.assertEqual(soup.original_encoding, 'iso8859-8')
|
||||||
|
self.assertEqual(
|
||||||
|
soup.encode('utf-8'),
|
||||||
|
hebrew_document.decode("iso8859-8").encode("utf-8"))
|
||||||
|
|
||||||
|
def test_meta_tag_reflects_current_encoding(self):
|
||||||
|
# Here's the <meta> tag saying that a document is
|
||||||
|
# encoded in Shift-JIS.
|
||||||
|
meta_tag = ('<meta content="text/html; charset=x-sjis" '
|
||||||
|
'http-equiv="Content-type"/>')
|
||||||
|
|
||||||
|
# Here's a document incorporating that meta tag.
|
||||||
|
shift_jis_html = (
|
||||||
|
'<html><head>\n%s\n'
|
||||||
|
'<meta http-equiv="Content-language" content="ja"/>'
|
||||||
|
'</head><body>Shift-JIS markup goes here.') % meta_tag
|
||||||
|
soup = self.soup(shift_jis_html)
|
||||||
|
|
||||||
|
# Parse the document, and the charset is seemingly unaffected.
|
||||||
|
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
|
||||||
|
content = parsed_meta['content']
|
||||||
|
self.assertEqual('text/html; charset=x-sjis', content)
|
||||||
|
|
||||||
|
# But that value is actually a ContentMetaAttributeValue object.
|
||||||
|
self.assertTrue(isinstance(content, ContentMetaAttributeValue))
|
||||||
|
|
||||||
|
# And it will take on a value that reflects its current
|
||||||
|
# encoding.
|
||||||
|
self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
|
||||||
|
|
||||||
|
# For the rest of the story, see TestSubstitutions in
|
||||||
|
# test_tree.py.
|
||||||
|
|
||||||
|
def test_html5_style_meta_tag_reflects_current_encoding(self):
|
||||||
|
# Here's the <meta> tag saying that a document is
|
||||||
|
# encoded in Shift-JIS.
|
||||||
|
meta_tag = ('<meta id="encoding" charset="x-sjis" />')
|
||||||
|
|
||||||
|
# Here's a document incorporating that meta tag.
|
||||||
|
shift_jis_html = (
|
||||||
|
'<html><head>\n%s\n'
|
||||||
|
'<meta http-equiv="Content-language" content="ja"/>'
|
||||||
|
'</head><body>Shift-JIS markup goes here.') % meta_tag
|
||||||
|
soup = self.soup(shift_jis_html)
|
||||||
|
|
||||||
|
# Parse the document, and the charset is seemingly unaffected.
|
||||||
|
parsed_meta = soup.find('meta', id="encoding")
|
||||||
|
charset = parsed_meta['charset']
|
||||||
|
self.assertEqual('x-sjis', charset)
|
||||||
|
|
||||||
|
# But that value is actually a CharsetMetaAttributeValue object.
|
||||||
|
self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
|
||||||
|
|
||||||
|
# And it will take on a value that reflects its current
|
||||||
|
# encoding.
|
||||||
|
self.assertEqual('utf8', charset.encode("utf8"))
|
||||||
|
|
||||||
|
def test_tag_with_no_attributes_can_have_attributes_added(self):
|
||||||
|
data = self.soup("<a>text</a>")
|
||||||
|
data.a['foo'] = 'bar'
|
||||||
|
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
||||||
|
|
||||||
|
class XMLTreeBuilderSmokeTest(object):
|
||||||
|
|
||||||
|
def test_docstring_generated(self):
|
||||||
|
soup = self.soup("<root/>")
|
||||||
|
self.assertEqual(
|
||||||
|
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
|
||||||
|
|
||||||
|
def test_real_xhtml_document(self):
|
||||||
|
"""A real XHTML document should come out *exactly* the same as it went in."""
|
||||||
|
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>Hello.</title></head>
|
||||||
|
<body>Goodbye.</body>
|
||||||
|
</html>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(
|
||||||
|
soup.encode("utf-8"), markup)
|
||||||
|
|
||||||
|
def test_popping_namespaced_tag(self):
|
||||||
|
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(
|
||||||
|
unicode(soup.rss), markup)
|
||||||
|
|
||||||
|
def test_docstring_includes_correct_encoding(self):
|
||||||
|
soup = self.soup("<root/>")
|
||||||
|
self.assertEqual(
|
||||||
|
soup.encode("latin1"),
|
||||||
|
b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
|
||||||
|
|
||||||
|
def test_large_xml_document(self):
|
||||||
|
"""A large XML document should come out the same as it went in."""
|
||||||
|
markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
|
||||||
|
+ b'0' * (2**12)
|
||||||
|
+ b'</root>')
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(soup.encode("utf-8"), markup)
|
||||||
|
|
||||||
|
|
||||||
|
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
|
||||||
|
self.assertSoupEquals("<p>", "<p/>")
|
||||||
|
self.assertSoupEquals("<p>foo</p>")
|
||||||
|
|
||||||
|
def test_namespaces_are_preserved(self):
|
||||||
|
markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
root = soup.root
|
||||||
|
self.assertEqual("http://example.com/", root['xmlns:a'])
|
||||||
|
self.assertEqual("http://example.net/", root['xmlns:b'])
|
||||||
|
|
||||||
|
def test_closing_namespaced_tag(self):
|
||||||
|
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(unicode(soup.p), markup)
|
||||||
|
|
||||||
|
def test_namespaced_attributes(self):
|
||||||
|
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(unicode(soup.foo), markup)
|
||||||
|
|
||||||
|
def test_namespaced_attributes_xml_namespace(self):
|
||||||
|
markup = '<foo xml:lang="fr">bar</foo>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(unicode(soup.foo), markup)
|
||||||
|
|
||||||
|
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
||||||
|
"""Smoke test for a tree builder that supports HTML5."""
|
||||||
|
|
||||||
|
def test_real_xhtml_document(self):
|
||||||
|
# Since XHTML is not HTML5, HTML5 parsers are not tested to handle
|
||||||
|
# XHTML documents in any particular way.
|
||||||
|
pass
|
||||||
|
|
||||||
|
def test_html_tags_have_namespace(self):
|
||||||
|
markup = "<a>"
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
|
||||||
|
|
||||||
|
def test_svg_tags_have_namespace(self):
|
||||||
|
markup = '<svg><circle/></svg>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
namespace = "http://www.w3.org/2000/svg"
|
||||||
|
self.assertEqual(namespace, soup.svg.namespace)
|
||||||
|
self.assertEqual(namespace, soup.circle.namespace)
|
||||||
|
|
||||||
|
|
||||||
|
def test_mathml_tags_have_namespace(self):
|
||||||
|
markup = '<math><msqrt>5</msqrt></math>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
namespace = 'http://www.w3.org/1998/Math/MathML'
|
||||||
|
self.assertEqual(namespace, soup.math.namespace)
|
||||||
|
self.assertEqual(namespace, soup.msqrt.namespace)
|
||||||
|
|
||||||
|
def test_xml_declaration_becomes_comment(self):
|
||||||
|
markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertTrue(isinstance(soup.contents[0], Comment))
|
||||||
|
self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
|
||||||
|
self.assertEqual("html", soup.contents[0].next_element.name)
|
||||||
|
|
||||||
|
def skipIf(condition, reason):
|
||||||
|
def nothing(test, *args, **kwargs):
|
||||||
|
return None
|
||||||
|
|
||||||
|
def decorator(test_item):
|
||||||
|
if condition:
|
||||||
|
return nothing
|
||||||
|
else:
|
||||||
|
return test_item
|
||||||
|
|
||||||
|
return decorator
|
@ -0,0 +1 @@
|
|||||||
|
"The beautifulsoup tests."
|
@ -0,0 +1,141 @@
|
|||||||
|
"""Tests of the builder registry."""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.builder import (
|
||||||
|
builder_registry as registry,
|
||||||
|
HTMLParserTreeBuilder,
|
||||||
|
TreeBuilderRegistry,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from bs4.builder import HTML5TreeBuilder
|
||||||
|
HTML5LIB_PRESENT = True
|
||||||
|
except ImportError:
|
||||||
|
HTML5LIB_PRESENT = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
from bs4.builder import (
|
||||||
|
LXMLTreeBuilderForXML,
|
||||||
|
LXMLTreeBuilder,
|
||||||
|
)
|
||||||
|
LXML_PRESENT = True
|
||||||
|
except ImportError:
|
||||||
|
LXML_PRESENT = False
|
||||||
|
|
||||||
|
|
||||||
|
class BuiltInRegistryTest(unittest.TestCase):
|
||||||
|
"""Test the built-in registry with the default builders registered."""
|
||||||
|
|
||||||
|
def test_combination(self):
|
||||||
|
if LXML_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('fast', 'html'),
|
||||||
|
LXMLTreeBuilder)
|
||||||
|
|
||||||
|
if LXML_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('permissive', 'xml'),
|
||||||
|
LXMLTreeBuilderForXML)
|
||||||
|
self.assertEqual(registry.lookup('strict', 'html'),
|
||||||
|
HTMLParserTreeBuilder)
|
||||||
|
if HTML5LIB_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('html5lib', 'html'),
|
||||||
|
HTML5TreeBuilder)
|
||||||
|
|
||||||
|
def test_lookup_by_markup_type(self):
|
||||||
|
if LXML_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
|
||||||
|
self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
|
||||||
|
else:
|
||||||
|
self.assertEqual(registry.lookup('xml'), None)
|
||||||
|
if HTML5LIB_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
|
||||||
|
else:
|
||||||
|
self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
|
||||||
|
|
||||||
|
def test_named_library(self):
|
||||||
|
if LXML_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('lxml', 'xml'),
|
||||||
|
LXMLTreeBuilderForXML)
|
||||||
|
self.assertEqual(registry.lookup('lxml', 'html'),
|
||||||
|
LXMLTreeBuilder)
|
||||||
|
if HTML5LIB_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('html5lib'),
|
||||||
|
HTML5TreeBuilder)
|
||||||
|
|
||||||
|
self.assertEqual(registry.lookup('html.parser'),
|
||||||
|
HTMLParserTreeBuilder)
|
||||||
|
|
||||||
|
def test_beautifulsoup_constructor_does_lookup(self):
|
||||||
|
# You can pass in a string.
|
||||||
|
BeautifulSoup("", features="html")
|
||||||
|
# Or a list of strings.
|
||||||
|
BeautifulSoup("", features=["html", "fast"])
|
||||||
|
|
||||||
|
# You'll get an exception if BS can't find an appropriate
|
||||||
|
# builder.
|
||||||
|
self.assertRaises(ValueError, BeautifulSoup,
|
||||||
|
"", features="no-such-feature")
|
||||||
|
|
||||||
|
class RegistryTest(unittest.TestCase):
|
||||||
|
"""Test the TreeBuilderRegistry class in general."""
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.registry = TreeBuilderRegistry()
|
||||||
|
|
||||||
|
def builder_for_features(self, *feature_list):
|
||||||
|
cls = type('Builder_' + '_'.join(feature_list),
|
||||||
|
(object,), {'features' : feature_list})
|
||||||
|
|
||||||
|
self.registry.register(cls)
|
||||||
|
return cls
|
||||||
|
|
||||||
|
def test_register_with_no_features(self):
|
||||||
|
builder = self.builder_for_features()
|
||||||
|
|
||||||
|
# Since the builder advertises no features, you can't find it
|
||||||
|
# by looking up features.
|
||||||
|
self.assertEqual(self.registry.lookup('foo'), None)
|
||||||
|
|
||||||
|
# But you can find it by doing a lookup with no features, if
|
||||||
|
# this happens to be the only registered builder.
|
||||||
|
self.assertEqual(self.registry.lookup(), builder)
|
||||||
|
|
||||||
|
def test_register_with_features_makes_lookup_succeed(self):
|
||||||
|
builder = self.builder_for_features('foo', 'bar')
|
||||||
|
self.assertEqual(self.registry.lookup('foo'), builder)
|
||||||
|
self.assertEqual(self.registry.lookup('bar'), builder)
|
||||||
|
|
||||||
|
def test_lookup_fails_when_no_builder_implements_feature(self):
|
||||||
|
builder = self.builder_for_features('foo', 'bar')
|
||||||
|
self.assertEqual(self.registry.lookup('baz'), None)
|
||||||
|
|
||||||
|
def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
|
||||||
|
builder1 = self.builder_for_features('foo')
|
||||||
|
builder2 = self.builder_for_features('bar')
|
||||||
|
self.assertEqual(self.registry.lookup(), builder2)
|
||||||
|
|
||||||
|
def test_lookup_fails_when_no_tree_builders_registered(self):
|
||||||
|
self.assertEqual(self.registry.lookup(), None)
|
||||||
|
|
||||||
|
def test_lookup_gets_most_recent_builder_supporting_all_features(self):
|
||||||
|
has_one = self.builder_for_features('foo')
|
||||||
|
has_the_other = self.builder_for_features('bar')
|
||||||
|
has_both_early = self.builder_for_features('foo', 'bar', 'baz')
|
||||||
|
has_both_late = self.builder_for_features('foo', 'bar', 'quux')
|
||||||
|
lacks_one = self.builder_for_features('bar')
|
||||||
|
has_the_other = self.builder_for_features('foo')
|
||||||
|
|
||||||
|
# There are two builders featuring 'foo' and 'bar', but
|
||||||
|
# the one that also features 'quux' was registered later.
|
||||||
|
self.assertEqual(self.registry.lookup('foo', 'bar'),
|
||||||
|
has_both_late)
|
||||||
|
|
||||||
|
# There is only one builder featuring 'foo', 'bar', and 'baz'.
|
||||||
|
self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
|
||||||
|
has_both_early)
|
||||||
|
|
||||||
|
def test_lookup_fails_when_cannot_reconcile_requested_features(self):
|
||||||
|
builder1 = self.builder_for_features('foo', 'bar')
|
||||||
|
builder2 = self.builder_for_features('foo', 'baz')
|
||||||
|
self.assertEqual(self.registry.lookup('bar', 'baz'), None)
|
@ -0,0 +1,36 @@
|
|||||||
|
"Test harness for doctests."
|
||||||
|
|
||||||
|
# pylint: disable-msg=E0611,W0142
|
||||||
|
|
||||||
|
__metaclass__ = type
|
||||||
|
__all__ = [
|
||||||
|
'additional_tests',
|
||||||
|
]
|
||||||
|
|
||||||
|
import atexit
|
||||||
|
import doctest
|
||||||
|
import os
|
||||||
|
#from pkg_resources import (
|
||||||
|
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
DOCTEST_FLAGS = (
|
||||||
|
doctest.ELLIPSIS |
|
||||||
|
doctest.NORMALIZE_WHITESPACE |
|
||||||
|
doctest.REPORT_NDIFF)
|
||||||
|
|
||||||
|
|
||||||
|
# def additional_tests():
|
||||||
|
# "Run the doc tests (README.txt and docs/*, if any exist)"
|
||||||
|
# doctest_files = [
|
||||||
|
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
|
||||||
|
# if resource_exists('bs4', 'docs'):
|
||||||
|
# for name in resource_listdir('bs4', 'docs'):
|
||||||
|
# if name.endswith('.txt'):
|
||||||
|
# doctest_files.append(
|
||||||
|
# os.path.abspath(
|
||||||
|
# resource_filename('bs4', 'docs/%s' % name)))
|
||||||
|
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
|
||||||
|
# atexit.register(cleanup_resources)
|
||||||
|
# return unittest.TestSuite((
|
||||||
|
# doctest.DocFileSuite(*doctest_files, **kwargs)))
|
@ -0,0 +1,72 @@
|
|||||||
|
"""Tests to ensure that the html5lib tree builder generates good trees."""
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
# Feature detection: HTML5TreeBuilder is importable only when the
# optional html5lib library is installed.
#
# The old "except ImportError, e" form bound a name that was never
# used and is invalid syntax under Python 3; the binding is dropped,
# which is also valid on Python 2.6+.
try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
except ImportError:
    HTML5LIB_PRESENT = False
|
||||||
|
from bs4.element import SoupStrainer
|
||||||
|
from bs4.testing import (
|
||||||
|
HTML5TreeBuilderSmokeTest,
|
||||||
|
SoupTest,
|
||||||
|
skipIf,
|
||||||
|
)
|
||||||
|
|
||||||
|
@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        # Each test gets a fresh html5lib-backed builder.
        return HTML5TreeBuilder()

    def test_soupstrainer(self):
        # The html5lib tree builder does not support SoupStrainers: the
        # whole document is parsed anyway and a warning is issued.
        strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(markup, parse_only=strainer)
        # The strainer was ignored; the whole document came through.
        self.assertEqual(
            soup.decode(), self.document_for(markup))

        self.assertTrue(
            "the html5lib tree builder doesn't support parse_only" in
            str(w[0].message))

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_xml_declaration_followed_by_doctype(self):
        markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
  <head>
  </head>
  <body>
    <p>foo</p>
  </body>
</html>'''
        soup = self.soup(markup)
        # Verify that we can reach the <p> tag; this means the tree is
        # connected.  Use assertEqual: assertEquals is a deprecated
        # alias, and the rest of the suite uses assertEqual.
        self.assertEqual("<p>foo</p>", soup.p.encode())
|
@ -0,0 +1,19 @@
|
|||||||
|
"""Tests to ensure that the html.parser tree builder generates good
|
||||||
|
trees."""
|
||||||
|
|
||||||
|
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
|
||||||
|
from bs4.builder import HTMLParserTreeBuilder
|
||||||
|
|
||||||
|
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """Run the generic HTML smoke tests against the builder backed by
    the standard library's html.parser module."""

    @property
    def default_builder(self):
        # Each test gets a fresh html.parser-backed builder.
        return HTMLParserTreeBuilder()

    # html.parser can't handle namespaced doctypes, so the two
    # inherited tests exercising them are overridden with no-ops.

    def test_namespaced_system_doctype(self):
        pass

    def test_namespaced_public_doctype(self):
        pass
|
@ -0,0 +1,75 @@
|
|||||||
|
"""Tests to ensure that the lxml tree builder generates good trees."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
# Feature detection: the lxml-backed builders are importable only when
# the optional lxml library is installed.
#
# The old "except ImportError, e" form bound a name that was never
# used and is invalid syntax under Python 3; the binding is dropped,
# which is also valid on Python 2.6+.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    LXML_PRESENT = False
|
||||||
|
|
||||||
|
from bs4 import (
|
||||||
|
BeautifulSoup,
|
||||||
|
BeautifulStoneSoup,
|
||||||
|
)
|
||||||
|
from bs4.element import Comment, Doctype, SoupStrainer
|
||||||
|
from bs4.testing import skipIf
|
||||||
|
from bs4.tests import test_htmlparser
|
||||||
|
from bs4.testing import (
|
||||||
|
HTMLTreeBuilderSmokeTest,
|
||||||
|
XMLTreeBuilderSmokeTest,
|
||||||
|
SoupTest,
|
||||||
|
skipIf,
|
||||||
|
)
|
||||||
|
|
||||||
|
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        # Each test gets a fresh lxml-backed HTML builder.
        return LXMLTreeBuilder()

    def test_out_of_range_entity(self):
        # Numeric character references outside the Unicode range are
        # simply dropped from the output.
        self.assertSoupEquals(
            "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")

    def test_beautifulstonesoup_is_xml_parser(self):
        # The deprecated BSS class should pick an XML builder when one
        # is installed.
        # NOTE(review): with record=False, ``w`` is always None and the
        # binding is dead — confirm whether record=True was intended to
        # silence the deprecation warning.
        with warnings.catch_warnings(record=False) as w:
            soup = BeautifulStoneSoup("<b />")
        self.assertEqual(u"<b/>", unicode(soup.b))

    def test_real_xhtml_document(self):
        """lxml strips the XML definition from an XHTML doc, which is fine."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        # Apart from newlines and the stripped XML declaration, the
        # document should round-trip unchanged.
        parsed = self.soup(markup).encode("utf-8")
        expected = markup.replace(b'\n', b'').replace(
            b'<?xml version="1.0" encoding="utf-8"?>', b'')
        self.assertEqual(parsed.replace(b"\n", b''), expected)
|
||||||
|
|
||||||
|
|
||||||
|
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """See ``XMLTreeBuilderSmokeTest``."""
    # (Docstring fixed: it previously pointed at the HTML smoke test,
    # a copy-paste from LXMLTreeBuilderSmokeTest, but this class mixes
    # in XMLTreeBuilderSmokeTest.)

    @property
    def default_builder(self):
        # Each test gets a fresh lxml-backed XML builder.
        return LXMLTreeBuilderForXML()
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue