py3: more unicode porting

This commit is contained in:
Kovid Goyal 2019-06-01 14:47:05 +05:30
parent 20b065fb49
commit 151e736538
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 62 additions and 48 deletions

View File

@ -1,3 +1,4 @@
from __future__ import absolute_import, division, print_function, unicode_literals
''' '''
Basic support for manipulating OEB 1.x/2.0 content and metadata. Basic support for manipulating OEB 1.x/2.0 content and metadata.
''' '''
@ -22,7 +23,7 @@ from calibre.ebooks.oeb.parse_utils import (barename, XHTML_NS, RECOVER_PARSER,
namespace, XHTML, parse_html, NotHTML) namespace, XHTML, parse_html, NotHTML)
from calibre.utils.cleantext import clean_xml_chars from calibre.utils.cleantext import clean_xml_chars
from calibre.utils.short_uuid import uuid4 from calibre.utils.short_uuid import uuid4
from polyglot.builtins import iteritems, unicode_type, string_or_bytes, range, itervalues, filter from polyglot.builtins import iteritems, unicode_type, string_or_bytes, range, itervalues, filter, codepoint_to_chr
from polyglot.urllib import unquote as urlunquote, urldefrag, urljoin, urlparse, urlunparse from polyglot.urllib import unquote as urlunquote, urldefrag, urljoin, urlparse, urlunparse
from calibre.utils.icu import numeric_sort_key from calibre.utils.icu import numeric_sort_key
@ -148,7 +149,7 @@ def close_self_closing_tags(raw):
def uuid_id(): def uuid_id():
return u'u'+uuid4() return 'u' + uuid4()
def itercsslinks(raw): def itercsslinks(raw):
@ -169,12 +170,12 @@ def iterlinks(root, find_links_in_css=True):
''' '''
assert etree.iselement(root) assert etree.iselement(root)
for el in root.iter(): for el in root.iter('*'):
attribs = el.attrib
try: try:
tag = barename(el.tag).lower() tag = barename(el.tag).lower()
except Exception: except Exception:
continue continue
attribs = el.attrib
if tag == 'object': if tag == 'object':
codebase = None codebase = None
@ -323,7 +324,7 @@ PNG_MIME = types_map['.png']
SVG_MIME = types_map['.svg'] SVG_MIME = types_map['.svg']
BINARY_MIME = 'application/octet-stream' BINARY_MIME = 'application/octet-stream'
XHTML_CSS_NAMESPACE = u'@namespace "%s";\n' % XHTML_NS XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
OEB_STYLES = {CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css', 'xhtml/css'} OEB_STYLES = {CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css', 'xhtml/css'}
OEB_DOCS = {XHTML_MIME, 'text/html', OEB_DOC_MIME, OEB_DOCS = {XHTML_MIME, 'text/html', OEB_DOC_MIME,
@ -394,7 +395,7 @@ def xml2str(root, pretty_print=False, strip_comments=False, with_tail=True):
pretty_print=pretty_print, with_tail=with_tail) pretty_print=pretty_print, with_tail=with_tail)
if strip_comments: if strip_comments:
ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans) ans = re.compile(br'<!--.*?-->', re.DOTALL).sub(b'', ans)
return ans return ans
@ -432,12 +433,15 @@ def serialize(data, media_type, pretty_print=False):
return bytes(data) return bytes(data)
ASCII_CHARS = set(chr(x) for x in range(128)) ASCII_CHARS = frozenset(codepoint_to_chr(x) for x in range(128))
UNIBYTE_CHARS = set(chr(x) for x in range(256)) UNIBYTE_CHARS = frozenset(x.encode('ascii') for x in ASCII_CHARS)
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' USAFE = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
'abcdefghijklmnopqrstuvwxyz' 'abcdefghijklmnopqrstuvwxyz'
'0123456789' '_.-/~') '0123456789' '_.-/~')
URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE] URL_SAFE = frozenset(USAFE)
URL_SAFE_BYTES = frozenset(USAFE.encode('ascii'))
URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE_BYTES]
del USAFE
def urlquote(href): def urlquote(href):
@ -445,13 +449,16 @@ def urlquote(href):
That is, this function returns valid IRIs not valid URIs. In particular, That is, this function returns valid IRIs not valid URIs. In particular,
IRIs can contain non-ascii characters. """ IRIs can contain non-ascii characters. """
result = [] result = []
unsafe = 0 if isinstance(href, unicode_type) else 1 isbytes = isinstance(href, bytes)
unsafe = URL_UNSAFE[unsafe] unsafe = URL_UNSAFE[int(isbytes)]
esc, join = "%%%02x", ''
if isbytes:
esc, join = esc.encode('ascii'), b''
for char in href: for char in href:
if char in unsafe: if char in unsafe:
char = "%%%02x" % ord(char) char = esc % ord(char)
result.append(char) result.append(char)
return ''.join(result) return join.join(result)
def urlnormalize(href): def urlnormalize(href):
@ -852,7 +859,7 @@ class Metadata(object):
def to_opf1(self, parent=None): def to_opf1(self, parent=None):
nsmap = self._opf1_nsmap nsmap = self._opf1_nsmap
nsrmap = dict((value, key) for key, value in nsmap.items()) nsrmap = {value: key for key, value in iteritems(nsmap)}
elem = element(parent, 'metadata', nsmap=nsmap) elem = element(parent, 'metadata', nsmap=nsmap)
dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP) dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP)
xmeta = element(elem, 'x-metadata') xmeta = element(elem, 'x-metadata')
@ -866,7 +873,7 @@ class Metadata(object):
def to_opf2(self, parent=None): def to_opf2(self, parent=None):
nsmap = self._opf2_nsmap nsmap = self._opf2_nsmap
nsrmap = dict((value, key) for key, value in nsmap.items()) nsrmap = {value: key for key, value in iteritems(nsmap)}
elem = element(parent, OPF('metadata'), nsmap=nsmap) elem = element(parent, OPF('metadata'), nsmap=nsmap)
for term in self.items: for term in self.items:
for item in self.items[term]: for item in self.items[term]:
@ -935,10 +942,10 @@ class Manifest(object):
# Parsing {{{ # Parsing {{{
def _parse_xml(self, data): def _parse_xml(self, data):
if not data:
return
data = xml_to_unicode(data, strip_encoding_pats=True, data = xml_to_unicode(data, strip_encoding_pats=True,
assume_utf8=True, resolve_entities=True)[0] assume_utf8=True, resolve_entities=True)[0]
if not data:
return None
return etree.fromstring(data, parser=RECOVER_PARSER) return etree.fromstring(data, parser=RECOVER_PARSER)
def _parse_xhtml(self, data): def _parse_xhtml(self, data):
@ -956,7 +963,10 @@ class Manifest(object):
return data return data
def _parse_txt(self, data): def _parse_txt(self, data):
if '<html>' in data: has_html = '<html>'
if isinstance(data, bytes):
has_html = has_html.encode('ascii')
if has_html in data:
return self._parse_xhtml(data) return self._parse_xhtml(data)
self.oeb.log.debug('Converting', self.href, '...') self.oeb.log.debug('Converting', self.href, '...')
@ -1202,7 +1212,7 @@ class Manifest(object):
base = id base = id
index = 1 index = 1
while id in self.ids: while id in self.ids:
id = base + str(index) id = base + unicode_type(index)
index += 1 index += 1
if href is not None: if href is not None:
href = urlnormalize(href) href = urlnormalize(href)
@ -1210,7 +1220,7 @@ class Manifest(object):
index = 1 index = 1
lhrefs = {x.lower() for x in self.hrefs} lhrefs = {x.lower() for x in self.hrefs}
while href.lower() in lhrefs: while href.lower() in lhrefs:
href = base + str(index) + ext href = base + unicode_type(index) + ext
index += 1 index += 1
return id, unicode_type(href) return id, unicode_type(href)
@ -1600,17 +1610,17 @@ class TOC(object):
return 1 return 1
def get_lines(self, lvl=0): def get_lines(self, lvl=0):
ans = [(u'\t'*lvl) + u'TOC: %s --> %s'%(self.title, self.href)] ans = [('\t'*lvl) + 'TOC: %s --> %s'%(self.title, self.href)]
for child in self: for child in self:
ans.extend(child.get_lines(lvl+1)) ans.extend(child.get_lines(lvl+1))
return ans return ans
if ispy3: if ispy3:
def __str__(self): def __str__(self):
return u'\n'.join(self.get_lines()) return '\n'.join(self.get_lines())
else: else:
def __unicode__(self): def __unicode__(self):
return u'\n'.join(self.get_lines()) return '\n'.join(self.get_lines())
def __str__(self): def __str__(self):
return b'\n'.join([x.encode('utf-8') for x in self.get_lines()]) return b'\n'.join([x.encode('utf-8') for x in self.get_lines()])
@ -1734,11 +1744,11 @@ class PageList(object):
def to_ncx(self, parent=None): def to_ncx(self, parent=None):
plist = element(parent, NCX('pageList'), id=uuid_id()) plist = element(parent, NCX('pageList'), id=uuid_id())
values = dict((t, count(1)) for t in ('front', 'normal', 'special')) values = {t: count(1) for t in ('front', 'normal', 'special')}
for page in self.pages: for page in self.pages:
id = page.id or uuid_id() id = page.id or uuid_id()
type = page.type type = page.type
value = str(next(values[type])) value = unicode_type(next(values[type]))
attrib = {'id': id, 'value': value, 'type': type, 'playOrder': '0'} attrib = {'id': id, 'value': value, 'type': type, 'playOrder': '0'}
if page.klass: if page.klass:
attrib['class'] = page.klass attrib['class'] = page.klass
@ -1831,7 +1841,7 @@ class OEBBook(object):
def translate(self, text): def translate(self, text):
"""Translate :param:`text` into the book's primary language.""" """Translate :param:`text` into the book's primary language."""
lang = str(self.metadata.language[0]) lang = unicode_type(self.metadata.language[0])
lang = lang.split('-', 1)[0].lower() lang = lang.split('-', 1)[0].lower()
return translate(lang, text) return translate(lang, text)
@ -1842,14 +1852,14 @@ class OEBBook(object):
if isinstance(data, unicode_type): if isinstance(data, unicode_type):
return fix_data(data) return fix_data(data)
bom_enc = None bom_enc = None
if data[:4] in ('\0\0\xfe\xff', '\xff\xfe\0\0'): if data[:4] in (b'\0\0\xfe\xff', b'\xff\xfe\0\0'):
bom_enc = {'\0\0\xfe\xff':'utf-32-be', bom_enc = {b'\0\0\xfe\xff':'utf-32-be',
'\xff\xfe\0\0':'utf-32-le'}[data[:4]] b'\xff\xfe\0\0':'utf-32-le'}[data[:4]]
data = data[4:] data = data[4:]
elif data[:2] in ('\xff\xfe', '\xfe\xff'): elif data[:2] in (b'\xff\xfe', b'\xfe\xff'):
bom_enc = {'\xff\xfe':'utf-16-le', '\xfe\xff':'utf-16-be'}[data[:2]] bom_enc = {b'\xff\xfe':'utf-16-le', b'\xfe\xff':'utf-16-be'}[data[:2]]
data = data[2:] data = data[2:]
elif data[:3] == '\xef\xbb\xbf': elif data[:3] == b'\xef\xbb\xbf':
bom_enc = 'utf-8' bom_enc = 'utf-8'
data = data[3:] data = data[3:]
if bom_enc is not None: if bom_enc is not None:

View File

@ -1,5 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
''' '''
Read content from ereader pdb file with a 116 and 202 byte header created by Makebook. Read content from ereader pdb file with a 116 and 202 byte header created by Makebook.
''' '''
@ -57,15 +59,17 @@ class Reader202(FormatReader):
def decompress_text(self, number): def decompress_text(self, number):
from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.compression.palmdoc import decompress_doc
return decompress_doc(''.join([chr(ord(x) ^ 0xA5) for x in self.section_data(number)])).decode('cp1252' if self.encoding is None else self.encoding, 'replace') # noqa data = bytearray(self.section_data(number))
data = bytes(bytearray(x ^ 0xA5 for x in data))
return decompress_doc(data).decode(self.encoding or 'cp1252', 'replace')
def get_image(self, number): def get_image(self, number):
name = None name = None
img = None img = None
data = self.section_data(number) data = self.section_data(number)
if data.startswith('PNG'): if data.startswith(b'PNG'):
name = data[4:4 + 32].strip('\x00') name = data[4:4 + 32].strip(b'\x00')
img = data[62:] img = data[62:]
return name, img return name, img
@ -89,7 +93,7 @@ class Reader202(FormatReader):
if not os.path.exists(output_dir): if not os.path.exists(output_dir):
os.makedirs(output_dir) os.makedirs(output_dir)
pml = u'' pml = ''
for i in range(1, self.header_record.num_text_pages + 1): for i in range(1, self.header_record.num_text_pages + 1):
self.log.debug('Extracting text page %i' % i) self.log.debug('Extracting text page %i' % i)
pml += self.get_text_page(i) pml += self.get_text_page(i)
@ -98,7 +102,7 @@ class Reader202(FormatReader):
if not isinstance(title, unicode_type): if not isinstance(title, unicode_type):
title = title.decode('utf-8', 'replace') title = title.decode('utf-8', 'replace')
html = u'<html><head><title>%s</title></head><body>%s</body></html>' % \ html = '<html><head><title>%s</title></head><body>%s</body></html>' % \
(title, pml_to_html(pml)) (title, pml_to_html(pml))
with CurrentDir(output_dir): with CurrentDir(output_dir):

View File

@ -10,7 +10,7 @@ import re
from itertools import groupby from itertools import groupby
from operator import itemgetter from operator import itemgetter
from collections import Counter, OrderedDict from collections import Counter, OrderedDict
from polyglot.builtins import iteritems, map, zip, unicode_type from polyglot.builtins import iteritems, map, zip, unicode_type, codepoint_to_chr
from calibre import as_unicode from calibre import as_unicode
from calibre.ebooks.pdf.render.common import (Array, String, Stream, from calibre.ebooks.pdf.render.common import (Array, String, Stream,
@ -123,7 +123,7 @@ class Font(object):
self.metrics, self.compress = metrics, compress self.metrics, self.compress = metrics, compress
self.is_otf = self.metrics.is_otf self.is_otf = self.metrics.is_otf
self.subset_tag = unicode_type( self.subset_tag = unicode_type(
re.sub('.', lambda m: chr(int(m.group())+ord('A')), oct(num).replace('o', '') re.sub('.', lambda m: codepoint_to_chr(int(m.group())+ord('A')), oct(num).replace('o', '')
)).rjust(6, 'A') )).rjust(6, 'A')
self.font_stream = FontStream(metrics.is_otf, compress=compress) self.font_stream = FontStream(metrics.is_otf, compress=compress)
try: try:

View File

@ -19,7 +19,7 @@ from calibre.constants import plugins
from calibre.gui2.widgets2 import HistoryLineEdit2 from calibre.gui2.widgets2 import HistoryLineEdit2
from calibre.gui2.tweak_book import tprefs from calibre.gui2.tweak_book import tprefs
from calibre.gui2.tweak_book.widgets import Dialog, BusyCursor from calibre.gui2.tweak_book.widgets import Dialog, BusyCursor
from calibre.utils.icu import safe_chr as chr from calibre.utils.icu import safe_chr as codepoint_to_chr
from calibre.utils.unicode_names import character_name_from_code, points_for_word from calibre.utils.unicode_names import character_name_from_code, points_for_word
from polyglot.builtins import unicode_type, range, map from polyglot.builtins import unicode_type, range, map
@ -570,7 +570,7 @@ class CharDelegate(QStyledItemDelegate):
f = option.font f = option.font
f.setPixelSize(option.rect.height() - 8) f.setPixelSize(option.rect.height() - 8)
painter.setFont(f) painter.setFont(f)
painter.drawText(option.rect, Qt.AlignHCenter | Qt.AlignBottom | Qt.TextSingleLine, chr(charcode)) painter.drawText(option.rect, Qt.AlignHCenter | Qt.AlignBottom | Qt.TextSingleLine, codepoint_to_chr(charcode))
def paint_non_printing(self, painter, option, charcode): def paint_non_printing(self, painter, option, charcode):
text = self.np_pat.sub(r'\n\1', non_printing[charcode]) text = self.np_pat.sub(r'\n\1', non_printing[charcode])
@ -612,7 +612,7 @@ class CharView(QListView):
except (TypeError, ValueError): except (TypeError, ValueError):
pass pass
else: else:
self.char_selected.emit(chr(char_code)) self.char_selected.emit(codepoint_to_chr(char_code))
def set_allow_drag_and_drop(self, enabled): def set_allow_drag_and_drop(self, enabled):
if not enabled: if not enabled:
@ -663,9 +663,9 @@ class CharView(QListView):
pass pass
else: else:
m = QMenu(self) m = QMenu(self)
m.addAction(QIcon(I('edit-copy.png')), _('Copy %s to clipboard') % chr(char_code), partial(self.copy_to_clipboard, char_code)) m.addAction(QIcon(I('edit-copy.png')), _('Copy %s to clipboard') % codepoint_to_chr(char_code), partial(self.copy_to_clipboard, char_code))
m.addAction(QIcon(I('rating.png')), m.addAction(QIcon(I('rating.png')),
(_('Remove %s from favorites') if self.showing_favorites else _('Add %s to favorites')) % chr(char_code), (_('Remove %s from favorites') if self.showing_favorites else _('Add %s to favorites')) % codepoint_to_chr(char_code),
partial(self.remove_from_favorites, char_code)) partial(self.remove_from_favorites, char_code))
if self.showing_favorites: if self.showing_favorites:
m.addAction(_('Restore favorites to defaults'), self.restore_defaults) m.addAction(_('Restore favorites to defaults'), self.restore_defaults)
@ -679,7 +679,7 @@ class CharView(QListView):
def copy_to_clipboard(self, char_code): def copy_to_clipboard(self, char_code):
c = QApplication.clipboard() c = QApplication.clipboard()
c.setText(chr(char_code)) c.setText(codepoint_to_chr(char_code))
def remove_from_favorites(self, char_code): def remove_from_favorites(self, char_code):
existing = tprefs['charmap_favorites'] existing = tprefs['charmap_favorites']