py3: add polyglot imports for BeautifulSoup3

Upstream bs4 has renamed self.unicode to self.unicode_markup, but calibre does not use UnicodeDammit. Leave the attribute in its historic, horribly confusing state: having a class instance attribute with the same name as a Python 2 built-in type should not cause any harm.
2025-07-09 03:04:10 -04:00 · 2019-03-11 12:30:26 -04:00 · 2019-03-11 12:30:26 -04:00 · 56af613e10
commit 56af613e10
parent cbc42bec23
1 changed files with 22 additions and 22 deletions
--- a/src/calibre/ebooks/BeautifulSoup.py
+++ b/src/calibre/ebooks/BeautifulSoup.py
@ -76,8 +76,7 @@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.

 """
-from __future__ import generators
-from __future__ import print_function
+from __future__ import generators, print_function

 __author__ = "Leonard Richardson (leonardr@segfault.org)"
 __version__ = "3.0.5"
@ -90,6 +89,7 @@ import types
 import re
 import calibre.ebooks.sgmllib as sgmllib
 from htmlentitydefs import name2codepoint
+from polyglot.builtins import codepoint_to_chr, unicode_type

 #This hack makes Beautiful Soup able to parse XML with namespaces
 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
@ -178,7 +178,7 @@ class PageElement:

    def insert(self, position, newChild):
        if (isinstance(newChild, basestring)
-            or isinstance(newChild, unicode)) \
+            or isinstance(newChild, unicode_type)) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

@ -383,19 +383,19 @@ class PageElement:
    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.
        ."""
-        if isinstance(s, unicode):
+        if isinstance(s, unicode_type):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
-                s = unicode(s)
+                s = unicode_type(s)
        else:
            if encoding:
                s  = self.toEncoding(str(s), encoding)
            else:
-                s = unicode(s)
+                s = unicode_type(s)
        return s

    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
@ -408,7 +408,7 @@ class PageElement:
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"


-class NavigableString(unicode, PageElement):
+class NavigableString(unicode_type, PageElement):

    def __getnewargs__(self):
        return (NavigableString.__str__(self),)
@ -423,7 +423,7 @@ class NavigableString(unicode, PageElement):
            raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))

    def __unicode__(self):
-        return unicode(str(self), DEFAULT_OUTPUT_ENCODING) # Changed by Kovid
+        return unicode_type(str(self), DEFAULT_OUTPUT_ENCODING) # Changed by Kovid

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Substitute outgoing XML entities.
@ -479,7 +479,7 @@ class Tag(PageElement):
        escaped."""
        x = match.group(1)
        if self.convertHTMLEntities and x in name2codepoint:
-            return unichr(name2codepoint[x])
+            return codepoint_to_chr(name2codepoint[x])
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
            if self.convertXMLEntities:
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
@ -488,9 +488,9 @@ class Tag(PageElement):
        elif len(x) > 0 and x[0] == '#':
            # Handle numeric entities
            if len(x) > 1 and x[1] == 'x':
-                return unichr(int(x[2:], 16))
+                return codepoint_to_chr(int(x[2:], 16))
            else:
-                return unichr(int(x[1:]))
+                return codepoint_to_chr(int(x[1:]))

        elif self.escapeUnrecognizedEntities:
            return u'&amp;%s;' % x
@ -899,7 +899,7 @@ class SoupStrainer:
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
-                markup = unicode(markup)
+                markup = unicode_type(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
@ -909,8 +909,8 @@ class SoupStrainer:
            elif hasattr(matchAgainst, 'items'):
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isString(markup):
-                if isinstance(markup, unicode):
-                    matchAgainst = unicode(matchAgainst)
+                if isinstance(markup, unicode_type):
+                    matchAgainst = unicode_type(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

@ -937,7 +937,7 @@ def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
-        return isinstance(s, unicode) or isinstance(s, basestring)
+        return isinstance(s, unicode_type) or isinstance(s, basestring)
    except NameError:
        return isinstance(s, str)

@ -1088,7 +1088,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
    def _feed(self, inDocumentEncoding=None):
        # Convert the document to Unicode.
        markup = self.markup
-        if isinstance(markup, unicode):
+        if isinstance(markup, unicode_type):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
@ -1328,7 +1328,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
            if ref.lower().startswith('x'): #
                ref = int(ref[1:], 16)      # Added by Kovid to handle hex numeric entities
            try:
-                data = unichr(int(ref))
+                data = codepoint_to_chr(int(ref))
            except ValueError: # Bad numerical entity. Added by Kovid
                data = u''
        else:
@ -1342,7 +1342,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
        data = None
        if self.convertHTMLEntities:
            try:
-                data = unichr(name2codepoint[ref])
+                data = codepoint_to_chr(name2codepoint[ref])
            except KeyError:
                pass

@ -1689,9 +1689,9 @@ class UnicodeDammit:
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []

-        if markup == '' or isinstance(markup, unicode):
+        if markup == '' or isinstance(markup, unicode_type):
            self.originalEncoding = None
-            self.unicode = unicode(markup)
+            self.unicode = unicode_type(markup)
            return

        u = None
@ -1704,7 +1704,7 @@ class UnicodeDammit:
                if u: break

        # If no luck and we have auto-detection library, try that:
-        if not u and chardet and not isinstance(self.markup, unicode):
+        if not u and chardet and not isinstance(self.markup, unicode_type):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
@ -1777,7 +1777,7 @@ class UnicodeDammit:
            encoding = 'utf-32le'
            data = data[4:]

-        newdata = unicode(data, encoding)
+        newdata = unicode_type(data, encoding)

        return newdata