BeautifulSoup: Port fix from upstream for outputting bare ampersands in strings. Fixes #1769481 [Calibre inserting semicolons](https://bugs.launchpad.net/calibre/+bug/1769481)

This commit is contained in:
Kovid Goyal 2018-05-07 08:00:27 +05:30
parent 073ce5522f
commit 6e91a17650
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -101,6 +101,14 @@ class PageElement:
"""Contains the navigational information for some part of the page """Contains the navigational information for some part of the page
(either a tag or a piece of text)""" (either a tag or a piece of text)"""
XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
"quot" : '"',
"amp" : "&",
"lt" : "<",
"gt" : ">" }
XML_SPECIAL_CHARS_TO_ENTITIES = {v: k for k, v in XML_ENTITIES_TO_SPECIAL_CHARS.items()}
def setup(self, parent=None, previous=None): def setup(self, parent=None, previous=None):
"""Sets up the initial relations between this element and """Sets up the initial relations between this element and
other elements.""" other elements."""
@ -338,31 +346,31 @@ class PageElement:
#NavigableStrings and Tags. #NavigableStrings and Tags.
def nextGenerator(self): def nextGenerator(self):
i = self i = self
while i: while i is not None:
i = i.next i = i.next
yield i yield i
def nextSiblingGenerator(self): def nextSiblingGenerator(self):
i = self i = self
while i: while i is not None:
i = i.nextSibling i = i.nextSibling
yield i yield i
def previousGenerator(self): def previousGenerator(self):
i = self i = self
while i: while i is not None:
i = i.previous i = i.previous
yield i yield i
def previousSiblingGenerator(self): def previousSiblingGenerator(self):
i = self i = self
while i: while i is not None:
i = i.previousSibling i = i.previousSibling
yield i yield i
def parentGenerator(self): def parentGenerator(self):
i = self i = self
while i: while i is not None:
i = i.parent i = i.parent
yield i yield i
@ -389,6 +397,16 @@ class PageElement:
s = unicode(s) s = unicode(s)
return s return s
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+ "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+ ")")
def _sub_entity(self, x):
"""Used with a regular expression to substitute the
appropriate XML entity for an XML special character."""
return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
class NavigableString(unicode, PageElement): class NavigableString(unicode, PageElement):
def __getnewargs__(self): def __getnewargs__(self):
@ -407,10 +425,12 @@ class NavigableString(unicode, PageElement):
return unicode(str(self), DEFAULT_OUTPUT_ENCODING) # Changed by Kovid return unicode(str(self), DEFAULT_OUTPUT_ENCODING) # Changed by Kovid
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
# Substitute outgoing XML entities.
data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self)
if encoding: if encoding:
return self.encode(encoding) return data.encode(encoding)
else: else:
return self return data
class CData(NavigableString): class CData(NavigableString):
@ -596,15 +616,6 @@ class Tag(PageElement):
def __unicode__(self): def __unicode__(self):
return self.__str__(None) return self.__str__(None)
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+ "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+ ")")
def _sub_entity(self, x):
"""Used with a regular expression to substitute the
appropriate XML entity for an XML special character."""
return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
prettyPrint=False, indentLevel=0): prettyPrint=False, indentLevel=0):
"""Returns a string or Unicode representation of this tag and """Returns a string or Unicode representation of this tag and