py3: add polyglot imports for BeautifulSoup3

Upstream bs4 has renamed self.unicode to self.unicode_markup, but
calibre does not use UnicodeDammit. Leave the attribute in its historic,
horribly confusing state, as it should not cause harm to have a class
instance attribute with the same name as a Python 2 built-in type.
This commit is contained in:
Eli Schwartz 2019-03-11 12:30:26 -04:00 committed by Kovid Goyal
parent cbc42bec23
commit 56af613e10
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -76,8 +76,7 @@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
""" """
from __future__ import generators from __future__ import generators, print_function
from __future__ import print_function
__author__ = "Leonard Richardson (leonardr@segfault.org)" __author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "3.0.5" __version__ = "3.0.5"
@ -90,6 +89,7 @@ import types
import re import re
import calibre.ebooks.sgmllib as sgmllib import calibre.ebooks.sgmllib as sgmllib
from htmlentitydefs import name2codepoint from htmlentitydefs import name2codepoint
from polyglot.builtins import codepoint_to_chr, unicode_type
#This hack makes Beautiful Soup able to parse XML with namespaces #This hack makes Beautiful Soup able to parse XML with namespaces
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
@ -178,7 +178,7 @@ class PageElement:
def insert(self, position, newChild): def insert(self, position, newChild):
if (isinstance(newChild, basestring) if (isinstance(newChild, basestring)
or isinstance(newChild, unicode)) \ or isinstance(newChild, unicode_type)) \
and not isinstance(newChild, NavigableString): and not isinstance(newChild, NavigableString):
newChild = NavigableString(newChild) newChild = NavigableString(newChild)
@ -383,19 +383,19 @@ class PageElement:
def toEncoding(self, s, encoding=None): def toEncoding(self, s, encoding=None):
"""Encodes an object to a string in some encoding, or to Unicode. """Encodes an object to a string in some encoding, or to Unicode.
.""" ."""
if isinstance(s, unicode): if isinstance(s, unicode_type):
if encoding: if encoding:
s = s.encode(encoding) s = s.encode(encoding)
elif isinstance(s, str): elif isinstance(s, str):
if encoding: if encoding:
s = s.encode(encoding) s = s.encode(encoding)
else: else:
s = unicode(s) s = unicode_type(s)
else: else:
if encoding: if encoding:
s = self.toEncoding(str(s), encoding) s = self.toEncoding(str(s), encoding)
else: else:
s = unicode(s) s = unicode_type(s)
return s return s
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
@ -408,7 +408,7 @@ class PageElement:
return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
class NavigableString(unicode, PageElement): class NavigableString(unicode_type, PageElement):
def __getnewargs__(self): def __getnewargs__(self):
return (NavigableString.__str__(self),) return (NavigableString.__str__(self),)
@ -423,7 +423,7 @@ class NavigableString(unicode, PageElement):
raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)) raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
def __unicode__(self): def __unicode__(self):
return unicode(str(self), DEFAULT_OUTPUT_ENCODING) # Changed by Kovid return unicode_type(str(self), DEFAULT_OUTPUT_ENCODING) # Changed by Kovid
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
# Substitute outgoing XML entities. # Substitute outgoing XML entities.
@ -479,7 +479,7 @@ class Tag(PageElement):
escaped.""" escaped."""
x = match.group(1) x = match.group(1)
if self.convertHTMLEntities and x in name2codepoint: if self.convertHTMLEntities and x in name2codepoint:
return unichr(name2codepoint[x]) return codepoint_to_chr(name2codepoint[x])
elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
if self.convertXMLEntities: if self.convertXMLEntities:
return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
@ -488,9 +488,9 @@ class Tag(PageElement):
elif len(x) > 0 and x[0] == '#': elif len(x) > 0 and x[0] == '#':
# Handle numeric entities # Handle numeric entities
if len(x) > 1 and x[1] == 'x': if len(x) > 1 and x[1] == 'x':
return unichr(int(x[2:], 16)) return codepoint_to_chr(int(x[2:], 16))
else: else:
return unichr(int(x[1:])) return codepoint_to_chr(int(x[1:]))
elif self.escapeUnrecognizedEntities: elif self.escapeUnrecognizedEntities:
return u'&amp;%s;' % x return u'&amp;%s;' % x
@ -899,7 +899,7 @@ class SoupStrainer:
if isinstance(markup, Tag): if isinstance(markup, Tag):
markup = markup.name markup = markup.name
if markup and not isString(markup): if markup and not isString(markup):
markup = unicode(markup) markup = unicode_type(markup)
#Now we know that chunk is either a string, or None. #Now we know that chunk is either a string, or None.
if hasattr(matchAgainst, 'match'): if hasattr(matchAgainst, 'match'):
# It's a regexp object. # It's a regexp object.
@ -909,8 +909,8 @@ class SoupStrainer:
elif hasattr(matchAgainst, 'items'): elif hasattr(matchAgainst, 'items'):
result = markup.has_key(matchAgainst) result = markup.has_key(matchAgainst)
elif matchAgainst and isString(markup): elif matchAgainst and isString(markup):
if isinstance(markup, unicode): if isinstance(markup, unicode_type):
matchAgainst = unicode(matchAgainst) matchAgainst = unicode_type(matchAgainst)
else: else:
matchAgainst = str(matchAgainst) matchAgainst = str(matchAgainst)
@ -937,7 +937,7 @@ def isString(s):
"""Convenience method that works with all 2.x versions of Python """Convenience method that works with all 2.x versions of Python
to determine whether or not something is stringlike.""" to determine whether or not something is stringlike."""
try: try:
return isinstance(s, unicode) or isinstance(s, basestring) return isinstance(s, unicode_type) or isinstance(s, basestring)
except NameError: except NameError:
return isinstance(s, str) return isinstance(s, str)
@ -1088,7 +1088,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
def _feed(self, inDocumentEncoding=None): def _feed(self, inDocumentEncoding=None):
# Convert the document to Unicode. # Convert the document to Unicode.
markup = self.markup markup = self.markup
if isinstance(markup, unicode): if isinstance(markup, unicode_type):
if not hasattr(self, 'originalEncoding'): if not hasattr(self, 'originalEncoding'):
self.originalEncoding = None self.originalEncoding = None
else: else:
@ -1328,7 +1328,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
if ref.lower().startswith('x'): # if ref.lower().startswith('x'): #
ref = int(ref[1:], 16) # Added by Kovid to handle hex numeric entities ref = int(ref[1:], 16) # Added by Kovid to handle hex numeric entities
try: try:
data = unichr(int(ref)) data = codepoint_to_chr(int(ref))
except ValueError: # Bad numerical entity. Added by Kovid except ValueError: # Bad numerical entity. Added by Kovid
data = u'' data = u''
else: else:
@ -1342,7 +1342,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
data = None data = None
if self.convertHTMLEntities: if self.convertHTMLEntities:
try: try:
data = unichr(name2codepoint[ref]) data = codepoint_to_chr(name2codepoint[ref])
except KeyError: except KeyError:
pass pass
@ -1689,9 +1689,9 @@ class UnicodeDammit:
self.smartQuotesTo = smartQuotesTo self.smartQuotesTo = smartQuotesTo
self.triedEncodings = [] self.triedEncodings = []
if markup == '' or isinstance(markup, unicode): if markup == '' or isinstance(markup, unicode_type):
self.originalEncoding = None self.originalEncoding = None
self.unicode = unicode(markup) self.unicode = unicode_type(markup)
return return
u = None u = None
@ -1704,7 +1704,7 @@ class UnicodeDammit:
if u: break if u: break
# If no luck and we have auto-detection library, try that: # If no luck and we have auto-detection library, try that:
if not u and chardet and not isinstance(self.markup, unicode): if not u and chardet and not isinstance(self.markup, unicode_type):
u = self._convertFrom(chardet.detect(self.markup)['encoding']) u = self._convertFrom(chardet.detect(self.markup)['encoding'])
# As a last resort, try utf-8 and windows-1252: # As a last resort, try utf-8 and windows-1252:
@ -1777,7 +1777,7 @@ class UnicodeDammit:
encoding = 'utf-32le' encoding = 'utf-32le'
data = data[4:] data = data[4:]
newdata = unicode(data, encoding) newdata = unicode_type(data, encoding)
return newdata return newdata