mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
py3: add polyglot imports for BeautifulSoup3
Upstream bs4 has renamed self.unicode to self.unicode_markup, but calibre does not use UnicodeDammit. Leave the attribute in its historic, horribly confusing state, as it should not cause harm for a class instance attribute to have the same name as a Python 2 built-in type.
This commit is contained in:
parent
cbc42bec23
commit
56af613e10
@ -76,8 +76,7 @@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
|
||||
|
||||
"""
|
||||
from __future__ import generators
|
||||
from __future__ import print_function
|
||||
from __future__ import generators, print_function
|
||||
|
||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||
__version__ = "3.0.5"
|
||||
@ -90,6 +89,7 @@ import types
|
||||
import re
|
||||
import calibre.ebooks.sgmllib as sgmllib
|
||||
from htmlentitydefs import name2codepoint
|
||||
from polyglot.builtins import codepoint_to_chr, unicode_type
|
||||
|
||||
#This hack makes Beautiful Soup able to parse XML with namespaces
|
||||
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
|
||||
@ -178,7 +178,7 @@ class PageElement:
|
||||
|
||||
def insert(self, position, newChild):
|
||||
if (isinstance(newChild, basestring)
|
||||
or isinstance(newChild, unicode)) \
|
||||
or isinstance(newChild, unicode_type)) \
|
||||
and not isinstance(newChild, NavigableString):
|
||||
newChild = NavigableString(newChild)
|
||||
|
||||
@ -383,19 +383,19 @@ class PageElement:
|
||||
def toEncoding(self, s, encoding=None):
|
||||
"""Encodes an object to a string in some encoding, or to Unicode.
|
||||
."""
|
||||
if isinstance(s, unicode):
|
||||
if isinstance(s, unicode_type):
|
||||
if encoding:
|
||||
s = s.encode(encoding)
|
||||
elif isinstance(s, str):
|
||||
if encoding:
|
||||
s = s.encode(encoding)
|
||||
else:
|
||||
s = unicode(s)
|
||||
s = unicode_type(s)
|
||||
else:
|
||||
if encoding:
|
||||
s = self.toEncoding(str(s), encoding)
|
||||
else:
|
||||
s = unicode(s)
|
||||
s = unicode_type(s)
|
||||
return s
|
||||
|
||||
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
|
||||
@ -408,7 +408,7 @@ class PageElement:
|
||||
return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
|
||||
|
||||
|
||||
class NavigableString(unicode, PageElement):
|
||||
class NavigableString(unicode_type, PageElement):
|
||||
|
||||
def __getnewargs__(self):
|
||||
return (NavigableString.__str__(self),)
|
||||
@ -423,7 +423,7 @@ class NavigableString(unicode, PageElement):
|
||||
raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
|
||||
|
||||
def __unicode__(self):
|
||||
return unicode(str(self), DEFAULT_OUTPUT_ENCODING) # Changed by Kovid
|
||||
return unicode_type(str(self), DEFAULT_OUTPUT_ENCODING) # Changed by Kovid
|
||||
|
||||
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
|
||||
# Substitute outgoing XML entities.
|
||||
@ -479,7 +479,7 @@ class Tag(PageElement):
|
||||
escaped."""
|
||||
x = match.group(1)
|
||||
if self.convertHTMLEntities and x in name2codepoint:
|
||||
return unichr(name2codepoint[x])
|
||||
return codepoint_to_chr(name2codepoint[x])
|
||||
elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
|
||||
if self.convertXMLEntities:
|
||||
return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
|
||||
@ -488,9 +488,9 @@ class Tag(PageElement):
|
||||
elif len(x) > 0 and x[0] == '#':
|
||||
# Handle numeric entities
|
||||
if len(x) > 1 and x[1] == 'x':
|
||||
return unichr(int(x[2:], 16))
|
||||
return codepoint_to_chr(int(x[2:], 16))
|
||||
else:
|
||||
return unichr(int(x[1:]))
|
||||
return codepoint_to_chr(int(x[1:]))
|
||||
|
||||
elif self.escapeUnrecognizedEntities:
|
||||
return u'&%s;' % x
|
||||
@ -899,7 +899,7 @@ class SoupStrainer:
|
||||
if isinstance(markup, Tag):
|
||||
markup = markup.name
|
||||
if markup and not isString(markup):
|
||||
markup = unicode(markup)
|
||||
markup = unicode_type(markup)
|
||||
#Now we know that chunk is either a string, or None.
|
||||
if hasattr(matchAgainst, 'match'):
|
||||
# It's a regexp object.
|
||||
@ -909,8 +909,8 @@ class SoupStrainer:
|
||||
elif hasattr(matchAgainst, 'items'):
|
||||
result = markup.has_key(matchAgainst)
|
||||
elif matchAgainst and isString(markup):
|
||||
if isinstance(markup, unicode):
|
||||
matchAgainst = unicode(matchAgainst)
|
||||
if isinstance(markup, unicode_type):
|
||||
matchAgainst = unicode_type(matchAgainst)
|
||||
else:
|
||||
matchAgainst = str(matchAgainst)
|
||||
|
||||
@ -937,7 +937,7 @@ def isString(s):
|
||||
"""Convenience method that works with all 2.x versions of Python
|
||||
to determine whether or not something is stringlike."""
|
||||
try:
|
||||
return isinstance(s, unicode) or isinstance(s, basestring)
|
||||
return isinstance(s, unicode_type) or isinstance(s, basestring)
|
||||
except NameError:
|
||||
return isinstance(s, str)
|
||||
|
||||
@ -1088,7 +1088,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
|
||||
def _feed(self, inDocumentEncoding=None):
|
||||
# Convert the document to Unicode.
|
||||
markup = self.markup
|
||||
if isinstance(markup, unicode):
|
||||
if isinstance(markup, unicode_type):
|
||||
if not hasattr(self, 'originalEncoding'):
|
||||
self.originalEncoding = None
|
||||
else:
|
||||
@ -1328,7 +1328,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
|
||||
if ref.lower().startswith('x'): #
|
||||
ref = int(ref[1:], 16) # Added by Kovid to handle hex numeric entities
|
||||
try:
|
||||
data = unichr(int(ref))
|
||||
data = codepoint_to_chr(int(ref))
|
||||
except ValueError: # Bad numerical entity. Added by Kovid
|
||||
data = u''
|
||||
else:
|
||||
@ -1342,7 +1342,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
|
||||
data = None
|
||||
if self.convertHTMLEntities:
|
||||
try:
|
||||
data = unichr(name2codepoint[ref])
|
||||
data = codepoint_to_chr(name2codepoint[ref])
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
@ -1689,9 +1689,9 @@ class UnicodeDammit:
|
||||
self.smartQuotesTo = smartQuotesTo
|
||||
self.triedEncodings = []
|
||||
|
||||
if markup == '' or isinstance(markup, unicode):
|
||||
if markup == '' or isinstance(markup, unicode_type):
|
||||
self.originalEncoding = None
|
||||
self.unicode = unicode(markup)
|
||||
self.unicode = unicode_type(markup)
|
||||
return
|
||||
|
||||
u = None
|
||||
@ -1704,7 +1704,7 @@ class UnicodeDammit:
|
||||
if u: break
|
||||
|
||||
# If no luck and we have auto-detection library, try that:
|
||||
if not u and chardet and not isinstance(self.markup, unicode):
|
||||
if not u and chardet and not isinstance(self.markup, unicode_type):
|
||||
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
|
||||
|
||||
# As a last resort, try utf-8 and windows-1252:
|
||||
@ -1777,7 +1777,7 @@ class UnicodeDammit:
|
||||
encoding = 'utf-32le'
|
||||
data = data[4:]
|
||||
|
||||
newdata = unicode(data, encoding)
|
||||
newdata = unicode_type(data, encoding)
|
||||
|
||||
return newdata
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user