mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Finished testing the KF8 chunker
This commit is contained in:
parent
eb2d0761b0
commit
abe11a17dc
@ -13,6 +13,7 @@ from collections import namedtuple
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.ebooks.oeb.base import XHTML_NS
|
from calibre.ebooks.oeb.base import XHTML_NS
|
||||||
|
from calibre.constants import ispy3
|
||||||
|
|
||||||
CHUNK_SIZE = 8192
|
CHUNK_SIZE = 8192
|
||||||
|
|
||||||
@ -48,6 +49,24 @@ def node_from_path(root, path):
|
|||||||
parent = parent[idx]
|
parent = parent[idx]
|
||||||
return parent
|
return parent
|
||||||
|
|
||||||
|
try:
    # Python 2: only unichr() can build a text character from a codepoint
    mychr = unichr
except NameError:
    # Python 3: unichr() is gone and chr() already returns text
    mychr = chr


def tostring(raw, **kwargs):
    ''' lxml *sometimes* represents non-ascii characters as hex entities in
    attribute values. I can't figure out exactly what circumstances cause it.
    It seems to happen when serializing a part of a larger tree. Since we need
    serialization to be the same when serializing full and partial trees, we
    manually replace all hex entities with their unicode codepoints.

    :param raw: The node that is handed to ``etree.tostring`` for
        serialization.
    :param kwargs: Extra keyword arguments forwarded to ``etree.tostring``
        (for example ``with_tail``). ``encoding`` is always overridden so that
        text is produced, and ``xml_declaration`` is consumed here rather than
        being passed on to lxml.
    :return: The serialized markup as text, optionally prefixed with an XML
        declaration, with every ``&#x...;`` entity replaced by the character
        it stands for.
    '''
    xml_declaration = kwargs.pop('xml_declaration', False)
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3; naming
    # ``unicode`` directly raises NameError on Python 3
    kwargs['encoding'] = type(u'')
    # The declaration is prepended manually below, so lxml must never emit one
    kwargs['xml_declaration'] = False
    ans = etree.tostring(raw, **kwargs)
    if xml_declaration:
        ans = '<?xml version="1.0" encoding="UTF-8"?>\n' + ans
    return re.sub(r'&#x([0-9A-Fa-f]+);', lambda m: mychr(int(m.group(1), 16)),
            ans)
|
||||||
|
|
||||||
class Chunk(object):
|
class Chunk(object):
|
||||||
|
|
||||||
def __init__(self, raw):
|
def __init__(self, raw):
|
||||||
@ -63,6 +82,12 @@ class Chunk(object):
|
|||||||
self.raw += chunk.raw
|
self.raw += chunk.raw
|
||||||
self.ends_tags = chunk.ends_tags
|
self.ends_tags = chunk.ends_tags
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return 'Chunk(len=%r insert_pos=%r starts_tags=%r ends_tags=%r)'%(
|
||||||
|
len(self.raw), self.insert_pos, self.starts_tags, self.ends_tags)
|
||||||
|
|
||||||
|
__str__ = __repr__
|
||||||
|
|
||||||
class Skeleton(object):
|
class Skeleton(object):
|
||||||
|
|
||||||
def __init__(self, file_number, item, root, chunks):
|
def __init__(self, file_number, item, root, chunks):
|
||||||
@ -76,8 +101,8 @@ class Skeleton(object):
|
|||||||
self.calculate_insert_positions()
|
self.calculate_insert_positions()
|
||||||
|
|
||||||
def render(self, root):
    ''' Serialize root, with an XML declaration, and add the XHTML namespace
    declaration to the first ``<html`` occurrence in the result.

    The tostring() helper in this module produces text, so the replacement
    is done with text arguments. Mixing in byte strings only works by
    accident on Python 2 (implicit ASCII decoding) and raises TypeError on
    Python 3.

    :param root: The tree to serialize via tostring().
    :return: The serialized markup as text.
    '''
    raw = tostring(root, xml_declaration=True)
    # Only the first match is replaced: that is the root element's open tag
    raw = raw.replace('<html', '<html xmlns="%s"' % XHTML_NS, 1)
    return raw
|
||||||
|
|
||||||
def calculate_metrics(self, root):
|
def calculate_metrics(self, root):
|
||||||
@ -85,8 +110,7 @@ class Skeleton(object):
|
|||||||
self.metrics = {}
|
self.metrics = {}
|
||||||
for tag in root.xpath('//*[@aid]'):
|
for tag in root.xpath('//*[@aid]'):
|
||||||
text = (tag.text or '').encode('utf-8')
|
text = (tag.text or '').encode('utf-8')
|
||||||
raw = etree.tostring(tag, encoding='UTF-8', with_tail=True,
|
raw = tostring(tag, with_tail=True)
|
||||||
xml_declaration=False)
|
|
||||||
start_length = len(raw.partition(b'>')[0]) + len(text) + 1
|
start_length = len(raw.partition(b'>')[0]) + len(text) + 1
|
||||||
end_length = len(raw.rpartition(b'<')[-1]) + 1
|
end_length = len(raw.rpartition(b'<')[-1]) + 1
|
||||||
self.metrics[tag.get('aid')] = Metric(start_length, end_length)
|
self.metrics[tag.get('aid')] = Metric(start_length, end_length)
|
||||||
@ -101,6 +125,13 @@ class Skeleton(object):
|
|||||||
for tag in chunk.ends_tags:
|
for tag in chunk.ends_tags:
|
||||||
pos += self.metrics[tag].end
|
pos += self.metrics[tag].end
|
||||||
|
|
||||||
|
def rebuild(self):
|
||||||
|
ans = self.skeleton
|
||||||
|
for chunk in self.chunks:
|
||||||
|
i = chunk.insert_pos
|
||||||
|
ans = ans[:i] + chunk.raw + ans[i:]
|
||||||
|
return ans
|
||||||
|
|
||||||
class Chunker(object):
|
class Chunker(object):
|
||||||
|
|
||||||
def __init__(self, oeb, data_func):
|
def __init__(self, oeb, data_func):
|
||||||
@ -109,10 +140,20 @@ class Chunker(object):
|
|||||||
|
|
||||||
self.skeletons = []
|
self.skeletons = []
|
||||||
|
|
||||||
|
# Set this to a list to enable dumping of the original and rebuilt
|
||||||
|
# html files for debugging
|
||||||
|
self.orig_dumps = []
|
||||||
|
|
||||||
for i, item in enumerate(self.oeb.spine):
|
for i, item in enumerate(self.oeb.spine):
|
||||||
root = self.remove_namespaces(self.data(item))
|
root = self.remove_namespaces(self.data(item))
|
||||||
body = root.xpath('//body')[0]
|
body = root.xpath('//body')[0]
|
||||||
body.tail = '\n'
|
body.tail = '\n'
|
||||||
|
if self.orig_dumps is not None:
|
||||||
|
self.orig_dumps.append(tostring(root, xml_declaration=True,
|
||||||
|
with_tail=True))
|
||||||
|
self.orig_dumps[-1] = close_self_closing_tags(
|
||||||
|
self.orig_dumps[-1].replace(b'<html',
|
||||||
|
bytes('<html xmlns="%s"'%XHTML_NS), 1))
|
||||||
|
|
||||||
# First pass: break up document into rendered strings of length no
|
# First pass: break up document into rendered strings of length no
|
||||||
# more than CHUNK_SIZE
|
# more than CHUNK_SIZE
|
||||||
@ -128,6 +169,9 @@ class Chunker(object):
|
|||||||
# for all chunks
|
# for all chunks
|
||||||
self.skeletons.append(Skeleton(i, item, root, chunks))
|
self.skeletons.append(Skeleton(i, item, root, chunks))
|
||||||
|
|
||||||
|
if self.orig_dumps:
|
||||||
|
self.dump()
|
||||||
|
|
||||||
def remove_namespaces(self, root):
|
def remove_namespaces(self, root):
|
||||||
lang = None
|
lang = None
|
||||||
for attr, val in root.attrib.iteritems():
|
for attr, val in root.attrib.iteritems():
|
||||||
@ -173,8 +217,7 @@ class Chunker(object):
|
|||||||
|
|
||||||
# Now loop over children
|
# Now loop over children
|
||||||
for child in list(tag):
|
for child in list(tag):
|
||||||
raw = etree.tostring(child, encoding='UTF-8',
|
raw = tostring(child, with_tail=False)
|
||||||
xml_declaration=False, with_tail=False)
|
|
||||||
raw = close_self_closing_tags(raw)
|
raw = close_self_closing_tags(raw)
|
||||||
if len(raw) > CHUNK_SIZE and child.get('aid', None):
|
if len(raw) > CHUNK_SIZE and child.get('aid', None):
|
||||||
self.step_into_tag(child, chunks)
|
self.step_into_tag(child, chunks)
|
||||||
@ -230,3 +273,19 @@ class Chunker(object):
|
|||||||
prev.merge(chunk)
|
prev.merge(chunk)
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
def dump(self):
|
||||||
|
import tempfile, shutil, os
|
||||||
|
tdir = os.path.join(tempfile.gettempdir(), 'skeleton')
|
||||||
|
self.log('Skeletons dumped to:', tdir)
|
||||||
|
if os.path.exists(tdir):
|
||||||
|
shutil.rmtree(tdir)
|
||||||
|
orig = os.path.join(tdir, 'orig')
|
||||||
|
rebuilt = os.path.join(tdir, 'rebuilt')
|
||||||
|
for x in (orig, rebuilt):
|
||||||
|
os.makedirs(x)
|
||||||
|
for i, skeleton in enumerate(self.skeletons):
|
||||||
|
with open(os.path.join(orig, '%04d.html'%i), 'wb') as f:
|
||||||
|
f.write(self.orig_dumps[i])
|
||||||
|
with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f:
|
||||||
|
f.write(skeleton.rebuild())
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user