# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from collections.abc import Callable # Python 3.6
from collections import Callable
from bs4.dammit import EntitySubstitution
DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)
whitespace_re = re.compile(r"\s+")
"""Alias one attribute name to another for backward compatibility"""
return getattr(self, attr)
return setattr(self, attr)
class NamespacedAttribute(str):
    """An attribute name that may be qualified by a namespace prefix.

    The string value is ``prefix:name`` when both parts are given, or
    whichever part is present when the other one is None. The parts and
    the namespace are kept on the instance as ``.prefix``, ``.name`` and
    ``.namespace``.
    """

    def __new__(cls, prefix, name, namespace=None):
        """Build the attribute string from its prefix and local name.

        :param prefix: Namespace prefix, or None for an unprefixed name.
        :param name: Local attribute name, or None to use the prefix alone.
        :param namespace: Namespace to associate with the attribute.
        :return: The new NamespacedAttribute instance.
        """
        if name is None:
            # Only a prefix was given; it is the whole attribute name.
            obj = str.__new__(cls, prefix)
        elif prefix is None:
            # Not really namespaced; the local name is the whole name.
            obj = str.__new__(cls, name)
        else:
            obj = str.__new__(cls, prefix + ":" + name)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj
class AttributeValueWithCharsetSubstitution(str):
    """Placeholder string standing in for a character encoding that
    was declared in an HTML document."""
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
"""A generic stand-in for the value of a meta tag's 'charset' attribute.
When Beautiful Soup parses the markup '<meta charset="utf8">', the
value of the 'charset' attribute will be one of these objects.
def __new__(cls, original_value):
obj = str.__new__(cls, original_value)
obj.original_value = original_value
def encode(self, encoding):
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    # Group 1 is the "charset=" introducer (with any leading ';' and
    # whitespace); group 3 is the encoding name that follows it.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Wrap ``original_value`` only if it declares a charset.

        :param original_value: The raw value of the 'content' attribute.
        :return: A plain ``str`` when no ``charset=`` is present, otherwise
            an instance of this class remembering ``original_value``.
        """
        match = cls.CHARSET_RE.search(original_value)
        if match is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        obj = str.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return the original value with its charset replaced.

        :param encoding: Name of the encoding to write after ``charset=``.
        :return: ``original_value`` with every ``charset=...`` declaration
            rewritten to name ``encoding``.
        """
        def rewrite(match):
            # Keep the "charset=" introducer, swap in the new encoding.
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)
class HTMLAwareEntitySubstitution(EntitySubstitution):
    """Entity substitution rules that are aware of some HTML quirks.

    Specifically, the contents of <script> and <style> tags should not
    undergo entity substitution.

    Incoming NavigableString objects are checked to see if they're the
    direct children of a <script> or <style> tag.
    """

    cdata_containing_tags = set(["script", "style"])

    preformatted_tags = set(["pre"])

    preserve_whitespace_tags = set(['pre', 'textarea'])

    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        """Apply ``f`` to ``ns`` unless ``ns`` sits directly inside a
        tag named in ``cdata_containing_tags``.

        :param ns: The string to process.
        :param f: The substitution function to call on ``ns``.
        :return: ``ns`` unchanged when it is a NavigableString whose parent
            is a <script> or <style> tag; otherwise ``f(ns)``.
        """
        if (isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in cls.cdata_containing_tags):
            # Do nothing: script/style contents are left alone.
            return ns
        # Substitute.
        return f(ns)

    @classmethod
    def substitute_html(cls, ns):
        """Run EntitySubstitution.substitute_html on ``ns`` unless it is
        the direct child of a <script> or <style> tag."""
        return cls._substitute_if_appropriate(
            ns, EntitySubstitution.substitute_html)

    @classmethod
    def substitute_xml(cls, ns):
        """Run EntitySubstitution.substitute_xml on ``ns`` unless it is
        the direct child of a <script> or <style> tag."""
        return cls._substitute_if_appropriate(
            ns, EntitySubstitution.substitute_xml)
class Formatter(object):
    """Contains information about how to format a parse tree."""

    # By default, represent void elements as <tag/> rather than <tag>
    void_element_close_prefix = '/'

    def substitute_entities(self, *args, **kwargs):
        """Transform certain characters into named entities.

        :raises NotImplementedError: Always; this base class provides no
            implementation.
        """
        raise NotImplementedError()
class HTMLFormatter(Formatter):
    """Formatter used for HTML output by default."""

    def substitute(self, *args, **kwargs):
        """Pass all arguments to
        HTMLAwareEntitySubstitution.substitute_html and return its result."""
        substituted = HTMLAwareEntitySubstitution.substitute_html(
            *args, **kwargs)
        return substituted
class MinimalHTMLFormatter(Formatter):
    """HTML formatter that performs only minimal substitution."""

    def substitute(self, *args, **kwargs):
        """Pass all arguments to
        HTMLAwareEntitySubstitution.substitute_xml and return its result."""
        substituted = HTMLAwareEntitySubstitution.substitute_xml(
            *args, **kwargs)
        return substituted
class HTML5Formatter(HTMLFormatter):
    """HTMLFormatter variant that leaves the slash out of a void tag."""

    # No close prefix: overrides the value inherited from the parent class.
    void_element_close_prefix = None
class XMLFormatter(Formatter):
    """Formatter that substitutes only the essential XML entities."""

    def substitute(self, *args, **kwargs):
        """Pass all arguments to EntitySubstitution.substitute_xml and
        return its result."""
        substituted = EntitySubstitution.substitute_xml(*args, **kwargs)
        return substituted
class HTMLXMLFormatter(Formatter):
    """Formatter that applies the HTML rules to XML."""

    def substitute(self, *args, **kwargs):
        """Pass all arguments to
        HTMLAwareEntitySubstitution.substitute_html and return its result."""
        substituted = HTMLAwareEntitySubstitution.substitute_html(
            *args, **kwargs)
        return substituted
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
# There are six possible values for the "formatter" argument passed in
# to methods like encode() and prettify():
# "html" - All Unicode characters with corresponding HTML entities
# are converted to those entities on output.
# "html5" - The same as "html", but empty void tags are represented as
# <tag> rather than <tag/>
# "minimal" - Bare ampersands and angle brackets are converted to
# XML entities: & < >
# None - The null formatter. Unicode characters are never
# converted to entities. This is not recommended, but it's
# A callable function - it will be called on every string that needs to undergo entity substitution.
# A Formatter instance - Formatter.substitute(string) will be called on every string that
# needs to undergo entity substitution.
# In an HTML document, the default "html", "html5", and "minimal"
# functions will leave the contents of <script> and <style> tags
# alone. For an XML document, all tags will be given the same
"html" : HTMLFormatter(),
"html5" : HTML5Formatter(),
"minimal" : MinimalHTMLFormatter(),
"html" : HTMLXMLFormatter(),
"minimal" : XMLFormatter(),
def format_string(self, s, formatter='minimal'):
    """Format the given string using the given formatter.

    :param s: The string to format.
    :param formatter: A formatter name (resolved through
        ``self._formatter_for_name``), None, a callable, or an object
        with a ``substitute`` method.
    :return: ``s`` unchanged when the formatter is None; otherwise the
        result of calling the formatter (or its ``substitute`` method)
        on ``s``.
    """
    if isinstance(formatter, str):
        formatter = self._formatter_for_name(formatter)
    if formatter is None:
        # The null formatter: the string is passed through untouched.
        return s
    if callable(formatter):
        # Backwards compatibility -- you used to pass in a formatting method.
        return formatter(s)
    return formatter.substitute(s)
"""Is this element part of an XML tree or an HTML tree?
This is used when mapping a formatter name ("minimal") to an
appropriate function (one that performs entity-substitution on
the contents of <script> and <style> tags, or not). It can be
inefficient, but it should be called very rarely.
if self.known_xml is not None:
# Most of the time we will have determined this when the
# Otherwise, it's likely that this element was created by
# direct invocation of the constructor from within the user's
# This is the top-level object. It should have .known_xml set
# from tree creation. If not, take a guess--BS is usually
return getattr(self, 'is_xml', False)
return self.parent._is_xml
def _formatter_for_name(self, name):
    """Look up a formatter based on its name and the tree.

    :param name: The formatter name to look up.
    :return: The entry for ``name`` in ``XML_FORMATTERS`` when
        ``self._is_xml`` is true, otherwise the entry in
        ``HTML_FORMATTERS``. An unknown name yields a fresh XMLFormatter
        or HTMLFormatter respectively.
    """
    if self._is_xml:
        return self.XML_FORMATTERS.get(name, XMLFormatter())
    return self.HTML_FORMATTERS.get(name, HTMLFormatter())
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
"""Sets up the initial relations between this element and
self.previous_element = previous_element
if previous_element is not None:
self.previous_element.next_element = self
self.next_element = next_element
self.next_element.previous_element = self
self.next_sibling = next_sibling
self.next_sibling.previous_sibling = self
and self.parent is not None and self.parent.contents):
previous_sibling = self.parent.contents[-1]
self.previous_sibling = previous_sibling
self.previous_sibling.next_sibling = self
nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3
def replace_with(self, replace_with):
"Cannot replace one element with another when the"
"element to be replaced is not part of a tree.")
if replace_with is self.parent:
raise ValueError("Cannot replace a Tag with its parent.")
my_index = self.parent.index(self)
old_parent.insert(my_index, replace_with)
replaceWith = replace_with # BS3
"Cannot replace an element with its contents when that"
"element is not part of a tree.")
my_index = self.parent.index(self)
for child in reversed(self.contents[:]):
my_parent.insert(my_index, child)
replace_with_children = unwrap
replaceWithChildren = unwrap # BS3
def wrap(self, wrap_inside):
me = self.replace_with(wrap_inside)
"""Destructively rips this element out of the tree."""
if self.parent is not None:
del self.parent.contents[self.parent.index(self)]
#Find the two elements that would be next to each other if
#this element (and any children) hadn't been parsed. Connect
last_child = self._last_descendant()
next_element = last_child.next_element
if (self.previous_element is not None and
self.previous_element is not next_element):
self.previous_element.next_element = next_element
if next_element is not None and next_element is not self.previous_element:
next_element.previous_element = self.previous_element
self.previous_element = None
last_child.next_element = None
if (self.previous_sibling is not None
and self.previous_sibling is not self.next_sibling):
self.previous_sibling.next_sibling = self.next_sibling
if (self.next_sibling is not None
and self.next_sibling is not self.previous_sibling):
self.next_sibling.previous_sibling = self.previous_sibling
self.previous_sibling = self.next_sibling = None
def _last_descendant(self, is_initialized=True, accept_self=True):
"Finds the last element beneath this object to be parsed."
if is_initialized and self.next_sibling:
last_child = self.next_sibling.previous_element
while isinstance(last_child, Tag) and last_child.contents:
last_child = last_child.contents[-1]
if not accept_self and last_child is self:
# BS3: Not part of the API!
_lastRecursiveChild = _last_descendant
def insert(self, position, new_child):
raise ValueError("Cannot insert None into a tag.")
raise ValueError("Cannot insert a tag into itself.")
if (isinstance(new_child, str)
and not isinstance(new_child, NavigableString)):
new_child = NavigableString(new_child)
from bs4 import BeautifulSoup
if isinstance(new_child, BeautifulSoup):
# We don't want to end up with a situation where one BeautifulSoup
# object contains another. Insert the children one at a time.
for subchild in list(new_child.contents):
self.insert(position, subchild)
position = min(position, len(self.contents))
if hasattr(new_child, 'parent') and new_child.parent is not None:
# We're 'inserting' an element that's already one
# of this object's children.
if new_child.parent is self:
current_index = self.index(new_child)
if current_index < position:
# We're moving this element further down the list
# of this object's children. That means that when
# we extract this element, our target index will
new_child.previous_sibling = None
new_child.previous_element = self
previous_child = self.contents[position - 1]
new_child.previous_sibling = previous_child
new_child.previous_sibling.next_sibling = new_child
new_child.previous_element = previous_child._last_descendant(False)
if new_child.previous_element is not None:
new_child.previous_element.next_element = new_child
new_childs_last_element = new_child._last_descendant(False)
if position >= len(self.contents):
new_child.next_sibling = None
parents_next_sibling = None
while parents_next_sibling is None and parent is not None:
parents_next_sibling = parent.next_sibling
if parents_next_sibling is not None:
# We found the element that comes next in the document.
if parents_next_sibling is not None:
new_childs_last_element.next_element = parents_next_sibling
# The last element of this tag is the last element in
new_childs_last_element.next_element = None
next_child = self.contents[position]
new_child.next_sibling = next_child
if new_child.next_sibling is not None:
new_child.next_sibling.previous_sibling = new_child
new_childs_last_element.next_element = next_child
if new_childs_last_element.next_element is not None:
new_childs_last_element.next_element.previous_element = new_childs_last_element
self.contents.insert(position, new_child)
"""Appends the given tag to the contents of this tag."""
self.insert(len(self.contents), tag)
def insert_before(self, predecessor):
    """Makes the given element the immediate predecessor of this one.

    The two elements will have the same parent, and the given element
    will be immediately before this one.

    :param predecessor: The element to place before this one.
    :raises ValueError: If ``predecessor`` is this element, or if this
        element has no parent.
    """
    if self is predecessor:
        raise ValueError("Can't insert an element before itself.")
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'before' has no meaning.")
    # Extract first so that the index won't be screwed up if they
    # are siblings.
    if isinstance(predecessor, PageElement):
        predecessor.extract()
    index = parent.index(self)
    parent.insert(index, predecessor)
def insert_after(self, successor):
    """Makes the given element the immediate successor of this one.

    The two elements will have the same parent, and the given element
    will be immediately after this one.

    :param successor: The element to place after this one.
    :raises ValueError: If ``successor`` is this element, or if this
        element has no parent.
    """
    if self is successor:
        raise ValueError("Can't insert an element after itself.")
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'after' has no meaning.")
    # Extract first so that the index won't be screwed up if they
    # are siblings.
    if isinstance(successor, PageElement):
        successor.extract()
    index = parent.index(self)
    parent.insert(index+1, successor)
def find_next(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the given criteria that appears
    after this Tag in the document.

    The lookup is handed to ``self._find_one`` with
    ``self.find_all_next`` as the search method.
    """
    search = self.find_all_next
    return self._find_one(search, name, attrs, text, **kwargs)
findNext = find_next # BS3
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                  **kwargs):
    """Returns all items that match the given criteria and appear
    after this Tag in the document.

    The search is delegated to ``self._find_all`` over
    ``self.next_elements``; extra keyword arguments are passed through.
    """
    return self._find_all(name, attrs, text, limit, self.next_elements,
                          **kwargs)
findAllNext = find_all_next # BS3
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the closest sibling to this Tag that matches the
    given criteria and appears after this Tag in the document.

    The lookup is handed to ``self._find_one`` with
    ``self.find_next_siblings`` as the search method; extra keyword
    arguments are passed through.
    """
    return self._find_one(self.find_next_siblings, name, attrs, text,
                          **kwargs)
findNextSibling = find_next_sibling # BS3
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                       **kwargs):
    """Returns the siblings of this Tag that match the given
    criteria and appear after this Tag in the document.

    The search is delegated to ``self._find_all`` over
    ``self.next_siblings``; extra keyword arguments are passed through.
    """
    return self._find_all(name, attrs, text, limit,
                          self.next_siblings, **kwargs)
findNextSiblings = find_next_siblings # BS3
fetchNextSiblings = find_next_siblings # BS2