py3: Port another instance of cStringIO

This commit is contained in:
Kovid Goyal 2019-04-01 16:49:20 +05:30
parent ce007d8bab
commit 5e10e3663b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import shutil, os, re, struct, textwrap, cStringIO import shutil, os, re, struct, textwrap, io
from lxml import html, etree from lxml import html, etree
@ -294,7 +294,7 @@ class MobiReader(object):
parse_cache[htmlfile] = root parse_cache[htmlfile] = root
self.htmlfile = htmlfile self.htmlfile = htmlfile
ncx = cStringIO.StringIO() ncx = io.BytesIO()
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf' self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
opf.render(lopen(self.created_opf_path, 'wb'), ncx, opf.render(lopen(self.created_opf_path, 'wb'), ncx,
@ -311,7 +311,7 @@ class MobiReader(object):
if self.book_header.exth is not None or self.embedded_mi is not None: if self.book_header.exth is not None or self.embedded_mi is not None:
self.log.debug('Creating OPF...') self.log.debug('Creating OPF...')
ncx = cStringIO.StringIO() ncx = io.BytesIO()
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx, opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
ncx_manifest_entry) ncx_manifest_entry)
@ -320,9 +320,9 @@ class MobiReader(object):
write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx) write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
def read_embedded_metadata(self, root, elem, guide): def read_embedded_metadata(self, root, elem, guide):
raw = '<?xml version="1.0" encoding="utf-8" ?>\n<package>' + \ raw = b'<?xml version="1.0" encoding="utf-8" ?>\n<package>' + \
html.tostring(elem, encoding='utf-8') + '</package>' html.tostring(elem, encoding='utf-8') + b'</package>'
stream = cStringIO.StringIO(raw) stream = io.BytesIO(raw)
opf = OPF(stream) opf = OPF(stream)
self.embedded_mi = opf.to_book_metadata() self.embedded_mi = opf.to_book_metadata()
if guide is not None: if guide is not None:
@ -828,37 +828,37 @@ class MobiReader(object):
def add_anchors(self): def add_anchors(self):
self.log.debug('Adding anchors...') self.log.debug('Adding anchors...')
positions = set([]) positions = set()
link_pattern = re.compile(r'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
re.IGNORECASE) re.IGNORECASE)
for match in link_pattern.finditer(self.mobi_html): for match in link_pattern.finditer(self.mobi_html):
positions.add(int(match.group(1))) positions.add(int(match.group(1)))
pos = 0 pos = 0
processed_html = cStringIO.StringIO() processed_html = []
end_tag_re = re.compile(r'<\s*/') end_tag_re = re.compile(br'<\s*/')
for end in sorted(positions): for end in sorted(positions):
if end == 0: if end == 0:
continue continue
oend = end oend = end
l = self.mobi_html.find('<', end) l = self.mobi_html.find(b'<', end)
r = self.mobi_html.find('>', end) r = self.mobi_html.find(b'>', end)
anchor = '<a id="filepos%d"></a>' anchor = b'<a id="filepos%d"></a>'
if r > -1 and (r < l or l == end or l == -1): if r > -1 and (r < l or l == end or l == -1):
p = self.mobi_html.rfind('<', 0, end + 1) p = self.mobi_html.rfind(b'<', 0, end + 1)
if (pos < end and p > -1 and not end_tag_re.match(self.mobi_html[p:r]) and if (pos < end and p > -1 and not end_tag_re.match(self.mobi_html[p:r]) and
not self.mobi_html[p:r + 1].endswith('/>')): not self.mobi_html[p:r + 1].endswith(b'/>')):
anchor = ' filepos-id="filepos%d"' anchor = b' filepos-id="filepos%d"'
end = r end = r
else: else:
end = r + 1 end = r + 1
processed_html.write(self.mobi_html[pos:end] + (anchor % oend)) processed_html.append(self.mobi_html[pos:end] + (anchor % oend))
pos = end pos = end
processed_html.write(self.mobi_html[pos:]) processed_html.append(self.mobi_html[pos:])
processed_html = processed_html.getvalue() processed_html = b''.join(processed_html)
# Remove anchors placed inside entities # Remove anchors placed inside entities
self.processed_html = re.sub(r'&([^;]*?)(<a id="filepos\d+"></a>)([^;]*);', self.processed_html = re.sub(br'&([^;]*?)(<a id="filepos\d+"></a>)([^;]*);',
r'&\1\3;\2', processed_html) br'&\1\3;\2', processed_html)
def extract_images(self, processed_records, output_dir): def extract_images(self, processed_records, output_dir):
self.log.debug('Extracting images...') self.log.debug('Extracting images...')