py3: more unicode porting

This commit is contained in:
Kovid Goyal 2019-06-01 14:47:05 +05:30
parent 20b065fb49
commit 151e736538
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 62 additions and 48 deletions

View File

@ -1,3 +1,4 @@
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Basic support for manipulating OEB 1.x/2.0 content and metadata.
'''
@ -22,7 +23,7 @@ from calibre.ebooks.oeb.parse_utils import (barename, XHTML_NS, RECOVER_PARSER,
namespace, XHTML, parse_html, NotHTML)
from calibre.utils.cleantext import clean_xml_chars
from calibre.utils.short_uuid import uuid4
from polyglot.builtins import iteritems, unicode_type, string_or_bytes, range, itervalues, filter
from polyglot.builtins import iteritems, unicode_type, string_or_bytes, range, itervalues, filter, codepoint_to_chr
from polyglot.urllib import unquote as urlunquote, urldefrag, urljoin, urlparse, urlunparse
from calibre.utils.icu import numeric_sort_key
@ -148,7 +149,7 @@ def close_self_closing_tags(raw):
def uuid_id():
return u'u'+uuid4()
return 'u' + uuid4()
def itercsslinks(raw):
@ -169,12 +170,12 @@ def iterlinks(root, find_links_in_css=True):
'''
assert etree.iselement(root)
for el in root.iter():
attribs = el.attrib
for el in root.iter('*'):
try:
tag = barename(el.tag).lower()
except Exception:
continue
attribs = el.attrib
if tag == 'object':
codebase = None
@ -323,7 +324,7 @@ PNG_MIME = types_map['.png']
SVG_MIME = types_map['.svg']
BINARY_MIME = 'application/octet-stream'
XHTML_CSS_NAMESPACE = u'@namespace "%s";\n' % XHTML_NS
XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
OEB_STYLES = {CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css', 'xhtml/css'}
OEB_DOCS = {XHTML_MIME, 'text/html', OEB_DOC_MIME,
@ -394,7 +395,7 @@ def xml2str(root, pretty_print=False, strip_comments=False, with_tail=True):
pretty_print=pretty_print, with_tail=with_tail)
if strip_comments:
ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
ans = re.compile(br'<!--.*?-->', re.DOTALL).sub(b'', ans)
return ans
@ -432,12 +433,15 @@ def serialize(data, media_type, pretty_print=False):
return bytes(data)
ASCII_CHARS = set(chr(x) for x in range(128))
UNIBYTE_CHARS = set(chr(x) for x in range(256))
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
ASCII_CHARS = frozenset(codepoint_to_chr(x) for x in range(128))
UNIBYTE_CHARS = frozenset(x.encode('ascii') for x in ASCII_CHARS)
USAFE = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
'abcdefghijklmnopqrstuvwxyz'
'0123456789' '_.-/~')
URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE]
URL_SAFE = frozenset(USAFE)
URL_SAFE_BYTES = frozenset(USAFE.encode('ascii'))
URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE_BYTES]
del USAFE
def urlquote(href):
@ -445,13 +449,16 @@ def urlquote(href):
That is, this function returns valid IRIs not valid URIs. In particular,
IRIs can contain non-ascii characters. """
result = []
unsafe = 0 if isinstance(href, unicode_type) else 1
unsafe = URL_UNSAFE[unsafe]
isbytes = isinstance(href, bytes)
unsafe = URL_UNSAFE[int(isbytes)]
esc, join = "%%%02x", ''
if isbytes:
esc, join = esc.encode('ascii'), b''
for char in href:
if char in unsafe:
char = "%%%02x" % ord(char)
char = esc % ord(char)
result.append(char)
return ''.join(result)
return join.join(result)
def urlnormalize(href):
@ -852,7 +859,7 @@ class Metadata(object):
def to_opf1(self, parent=None):
nsmap = self._opf1_nsmap
nsrmap = dict((value, key) for key, value in nsmap.items())
nsrmap = {value: key for key, value in iteritems(nsmap)}
elem = element(parent, 'metadata', nsmap=nsmap)
dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP)
xmeta = element(elem, 'x-metadata')
@ -866,7 +873,7 @@ class Metadata(object):
def to_opf2(self, parent=None):
nsmap = self._opf2_nsmap
nsrmap = dict((value, key) for key, value in nsmap.items())
nsrmap = {value: key for key, value in iteritems(nsmap)}
elem = element(parent, OPF('metadata'), nsmap=nsmap)
for term in self.items:
for item in self.items[term]:
@ -935,10 +942,10 @@ class Manifest(object):
# Parsing {{{
def _parse_xml(self, data):
if not data:
return
data = xml_to_unicode(data, strip_encoding_pats=True,
assume_utf8=True, resolve_entities=True)[0]
if not data:
return None
return etree.fromstring(data, parser=RECOVER_PARSER)
def _parse_xhtml(self, data):
@ -956,7 +963,10 @@ class Manifest(object):
return data
def _parse_txt(self, data):
if '<html>' in data:
has_html = '<html>'
if isinstance(data, bytes):
has_html = has_html.encode('ascii')
if has_html in data:
return self._parse_xhtml(data)
self.oeb.log.debug('Converting', self.href, '...')
@ -1202,7 +1212,7 @@ class Manifest(object):
base = id
index = 1
while id in self.ids:
id = base + str(index)
id = base + unicode_type(index)
index += 1
if href is not None:
href = urlnormalize(href)
@ -1210,7 +1220,7 @@ class Manifest(object):
index = 1
lhrefs = {x.lower() for x in self.hrefs}
while href.lower() in lhrefs:
href = base + str(index) + ext
href = base + unicode_type(index) + ext
index += 1
return id, unicode_type(href)
@ -1600,17 +1610,17 @@ class TOC(object):
return 1
def get_lines(self, lvl=0):
ans = [(u'\t'*lvl) + u'TOC: %s --> %s'%(self.title, self.href)]
ans = [('\t'*lvl) + 'TOC: %s --> %s'%(self.title, self.href)]
for child in self:
ans.extend(child.get_lines(lvl+1))
return ans
if ispy3:
def __str__(self):
return u'\n'.join(self.get_lines())
return '\n'.join(self.get_lines())
else:
def __unicode__(self):
return u'\n'.join(self.get_lines())
return '\n'.join(self.get_lines())
def __str__(self):
return b'\n'.join([x.encode('utf-8') for x in self.get_lines()])
@ -1734,11 +1744,11 @@ class PageList(object):
def to_ncx(self, parent=None):
plist = element(parent, NCX('pageList'), id=uuid_id())
values = dict((t, count(1)) for t in ('front', 'normal', 'special'))
values = {t: count(1) for t in ('front', 'normal', 'special')}
for page in self.pages:
id = page.id or uuid_id()
type = page.type
value = str(next(values[type]))
value = unicode_type(next(values[type]))
attrib = {'id': id, 'value': value, 'type': type, 'playOrder': '0'}
if page.klass:
attrib['class'] = page.klass
@ -1831,7 +1841,7 @@ class OEBBook(object):
def translate(self, text):
"""Translate :param:`text` into the book's primary language."""
lang = str(self.metadata.language[0])
lang = unicode_type(self.metadata.language[0])
lang = lang.split('-', 1)[0].lower()
return translate(lang, text)
@ -1842,14 +1852,14 @@ class OEBBook(object):
if isinstance(data, unicode_type):
return fix_data(data)
bom_enc = None
if data[:4] in ('\0\0\xfe\xff', '\xff\xfe\0\0'):
bom_enc = {'\0\0\xfe\xff':'utf-32-be',
'\xff\xfe\0\0':'utf-32-le'}[data[:4]]
if data[:4] in (b'\0\0\xfe\xff', b'\xff\xfe\0\0'):
bom_enc = {b'\0\0\xfe\xff':'utf-32-be',
b'\xff\xfe\0\0':'utf-32-le'}[data[:4]]
data = data[4:]
elif data[:2] in ('\xff\xfe', '\xfe\xff'):
bom_enc = {'\xff\xfe':'utf-16-le', '\xfe\xff':'utf-16-be'}[data[:2]]
elif data[:2] in (b'\xff\xfe', b'\xfe\xff'):
bom_enc = {b'\xff\xfe':'utf-16-le', 'b\xfe\xff':'utf-16-be'}[data[:2]]
data = data[2:]
elif data[:3] == '\xef\xbb\xbf':
elif data[:3] == b'\xef\xbb\xbf':
bom_enc = 'utf-8'
data = data[3:]
if bom_enc is not None:

View File

@ -1,5 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Read content from ereader pdb file with a 116 and 202 byte header created by Makebook.
'''
@ -57,15 +59,17 @@ class Reader202(FormatReader):
def decompress_text(self, number):
from calibre.ebooks.compression.palmdoc import decompress_doc
return decompress_doc(''.join([chr(ord(x) ^ 0xA5) for x in self.section_data(number)])).decode('cp1252' if self.encoding is None else self.encoding, 'replace') # noqa
data = bytearray(self.section_data(number))
data = bytes(bytearray(x ^ 0xA5 for x in data))
return decompress_doc(data).decode(self.encoding or 'cp1252', 'replace')
def get_image(self, number):
name = None
img = None
data = self.section_data(number)
if data.startswith('PNG'):
name = data[4:4 + 32].strip('\x00')
if data.startswith(b'PNG'):
name = data[4:4 + 32].strip(b'\x00')
img = data[62:]
return name, img
@ -89,7 +93,7 @@ class Reader202(FormatReader):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
pml = u''
pml = ''
for i in range(1, self.header_record.num_text_pages + 1):
self.log.debug('Extracting text page %i' % i)
pml += self.get_text_page(i)
@ -98,7 +102,7 @@ class Reader202(FormatReader):
if not isinstance(title, unicode_type):
title = title.decode('utf-8', 'replace')
html = u'<html><head><title>%s</title></head><body>%s</body></html>' % \
html = '<html><head><title>%s</title></head><body>%s</body></html>' % \
(title, pml_to_html(pml))
with CurrentDir(output_dir):

View File

@ -10,7 +10,7 @@ import re
from itertools import groupby
from operator import itemgetter
from collections import Counter, OrderedDict
from polyglot.builtins import iteritems, map, zip, unicode_type
from polyglot.builtins import iteritems, map, zip, unicode_type, codepoint_to_chr
from calibre import as_unicode
from calibre.ebooks.pdf.render.common import (Array, String, Stream,
@ -123,7 +123,7 @@ class Font(object):
self.metrics, self.compress = metrics, compress
self.is_otf = self.metrics.is_otf
self.subset_tag = unicode_type(
re.sub('.', lambda m: chr(int(m.group())+ord('A')), oct(num).replace('o', '')
re.sub('.', lambda m: codepoint_to_chr(int(m.group())+ord('A')), oct(num).replace('o', '')
)).rjust(6, 'A')
self.font_stream = FontStream(metrics.is_otf, compress=compress)
try:

View File

@ -19,7 +19,7 @@ from calibre.constants import plugins
from calibre.gui2.widgets2 import HistoryLineEdit2
from calibre.gui2.tweak_book import tprefs
from calibre.gui2.tweak_book.widgets import Dialog, BusyCursor
from calibre.utils.icu import safe_chr as chr
from calibre.utils.icu import safe_chr as codepoint_to_chr
from calibre.utils.unicode_names import character_name_from_code, points_for_word
from polyglot.builtins import unicode_type, range, map
@ -570,7 +570,7 @@ class CharDelegate(QStyledItemDelegate):
f = option.font
f.setPixelSize(option.rect.height() - 8)
painter.setFont(f)
painter.drawText(option.rect, Qt.AlignHCenter | Qt.AlignBottom | Qt.TextSingleLine, chr(charcode))
painter.drawText(option.rect, Qt.AlignHCenter | Qt.AlignBottom | Qt.TextSingleLine, codepoint_to_chr(charcode))
def paint_non_printing(self, painter, option, charcode):
text = self.np_pat.sub(r'\n\1', non_printing[charcode])
@ -612,7 +612,7 @@ class CharView(QListView):
except (TypeError, ValueError):
pass
else:
self.char_selected.emit(chr(char_code))
self.char_selected.emit(codepoint_to_chr(char_code))
def set_allow_drag_and_drop(self, enabled):
if not enabled:
@ -663,9 +663,9 @@ class CharView(QListView):
pass
else:
m = QMenu(self)
m.addAction(QIcon(I('edit-copy.png')), _('Copy %s to clipboard') % chr(char_code), partial(self.copy_to_clipboard, char_code))
m.addAction(QIcon(I('edit-copy.png')), _('Copy %s to clipboard') % codepoint_to_chr(char_code), partial(self.copy_to_clipboard, char_code))
m.addAction(QIcon(I('rating.png')),
(_('Remove %s from favorites') if self.showing_favorites else _('Add %s to favorites')) % chr(char_code),
(_('Remove %s from favorites') if self.showing_favorites else _('Add %s to favorites')) % codepoint_to_chr(char_code),
partial(self.remove_from_favorites, char_code))
if self.showing_favorites:
m.addAction(_('Restore favorites to defaults'), self.restore_defaults)
@ -679,7 +679,7 @@ class CharView(QListView):
def copy_to_clipboard(self, char_code):
c = QApplication.clipboard()
c.setText(chr(char_code))
c.setText(codepoint_to_chr(char_code))
def remove_from_favorites(self, char_code):
existing = tprefs['charmap_favorites']