From 7800024bac39d901c575f4369dd4528691faaf90 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 20:04:13 +0530 Subject: [PATCH 1/5] ... --- src/calibre/ebooks/mobi/writer8/main.py | 61 ++++++++++++++++++--- src/calibre/ebooks/mobi/writer8/skeleton.py | 42 +------------- 2 files changed, 57 insertions(+), 46 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index b924a4df7c..d8ef501eb6 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -9,13 +9,16 @@ __docformat__ = 'restructuredtext en' import copy from functools import partial -from collections import defaultdict +from collections import defaultdict, namedtuple +from io import BytesIO +from struct import pack import cssutils from lxml import etree from calibre import isbytestring, force_unicode -from calibre.ebooks.mobi.utils import to_base +from calibre.ebooks.mobi.utils import create_text_record, to_base +from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath, extract, XHTML, urlnormalize) from calibre.ebooks.oeb.parse_utils import barename @@ -31,11 +34,14 @@ class KF8Writer(object): def __init__(self, oeb, opts, resources): self.oeb, self.opts, self.log = oeb, opts, oeb.log + self.compress = not self.opts.dont_compress self.log.info('Creating KF8 output') self.used_images = set() self.resources = resources self.dup_data() self.flows = [None] # First flow item is reserved for the text + self.records = [] + self.fdst_table = [] self.replace_resource_links() self.extract_css_into_flows() @@ -43,6 +49,10 @@ class KF8Writer(object): self.replace_internal_links_with_placeholders() self.insert_aid_attributes() self.chunk_it_up() + # Dump the cloned data as it is no longer needed + del self._data_cache + self.create_text_records() + self.create_fdst_table() def dup_data(self): ''' Duplicate data so that any changes we make to markup/CSS only @@ -205,12 +215,49 @@ class KF8Writer(object): if aid is None: aid = self.id_map.get((href, '')) placeholder_map[placeholder] = aid - chunker = Chunker(self.oeb, self.data, not self.opts.dont_compress, - placeholder_map) + chunker = Chunker(self.oeb, self.data, placeholder_map) - for x in ('skel_table', 'chunk_table', 'aid_offset_map', 'records', - 'last_text_record_idx', 'first_non_text_record_idx', - 'text_length'): + for x in ('skel_table', 'chunk_table', 'aid_offset_map'): setattr(self, x, getattr(chunker, x)) + self.flows[0] = chunker.text + + def create_text_records(self): + self.flows = [x.encode('utf-8') if isinstance(x, unicode) else x for x + in self.flows] + text = b''.join(self.flows) + self.text_length = len(text) + text = BytesIO(text) + nrecords = 0 + records_size = 0 + + if self.compress: + self.oeb.logger.info(' Compressing markup content...') + + while text.tell() < self.text_length: + data, overlap = create_text_record(text) + if self.compress: + data = compress_doc(data) + + data += overlap + data += pack(b'>B', len(overlap)) + + self.records.append(data) + records_size += len(data) + nrecords += 1 + + self.last_text_record_idx = nrecords + self.first_non_text_record_idx = nrecords + 1 + # Pad so that the next records starts at a 4 byte boundary + if records_size % 4 != 0: + self.records.append(b'\x00'*(records_size % 4)) + self.first_non_text_record_idx += 1 + + def create_fdst_table(self): + FDST = namedtuple('Flow', 'start end') + for i, flow in enumerate(self.flows): + start = 0 if i == 0 else self.fdst_table[-1].end + self.fdst_table.append(FDST(start, start + len(flow))) + + diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index da3b9407bd..eff03c9de4 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -9,16 +9,13 @@ __docformat__ = 'restructuredtext en' import re from collections import namedtuple -from io import BytesIO -from struct import pack from functools import partial from lxml import etree from calibre.ebooks.oeb.base import XHTML_NS from calibre.constants import ispy3 -from calibre.ebooks.mobi.utils import create_text_record, to_base -from calibre.ebooks.compression.palmdoc import compress_doc +from calibre.ebooks.mobi.utils import to_base CHUNK_SIZE = 8192 @@ -159,10 +156,9 @@ class Skeleton(object): class Chunker(object): - def __init__(self, oeb, data_func, compress, placeholder_map): + def __init__(self, oeb, data_func, placeholder_map): self.oeb, self.log = oeb, oeb.log self.data = data_func - self.compress = compress self.placeholder_map = placeholder_map self.skeletons = [] @@ -207,11 +203,7 @@ class Chunker(object): # Set internal links text = b''.join(x.raw_text for x in self.skeletons) - text = self.set_internal_links(text) - - # Create text records - self.records = [] - self.create_text_records(text) + self.text = self.set_internal_links(text) def remove_namespaces(self, root): lang = None @@ -364,34 +356,6 @@ class Chunker(object): return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text) - def create_text_records(self, text): - self.text_length = len(text) - text = BytesIO(text) - nrecords = 0 - records_size = 0 - - if self.compress: - self.oeb.logger.info(' Compressing markup content...') - - while text.tell() < self.text_length: - data, overlap = create_text_record(text) - if self.compress: - data = compress_doc(data) - - data += overlap - data += pack(b'>B', len(overlap)) - - self.records.append(data) - records_size += len(data) - nrecords += 1 - - self.last_text_record_idx = nrecords - self.first_non_text_record_idx = nrecords + 1 - # Pad so that the next records starts at a 4 byte boundary - if records_size % 4 != 0: - self.records.append(b'\x00'*(records_size % 4)) - self.first_non_text_record_idx += 1 - def dump(self): import tempfile, shutil, os tdir = os.path.join(tempfile.gettempdir(), 'skeleton') From fbcd3eb279b8acd0d450d42069827fda8d9c9d0f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 20:11:12 +0530 Subject: [PATCH 2/5] ... --- recipes/tpm_uk.recipe | 2 ++ src/calibre/ebooks/mobi/writer8/main.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/tpm_uk.recipe b/recipes/tpm_uk.recipe index aa042de951..0ccad32fa9 100644 --- a/recipes/tpm_uk.recipe +++ b/recipes/tpm_uk.recipe @@ -11,6 +11,8 @@ class TPM_uk(BasicNewsRecipe): __author__ = 'Darko Miletic' description = 'Title says it all' publisher = "The Philosophers' Magazine" + recipe_disabled = ('This recipe has been disabled as the website has' + ' started providing articles only in PDF form') category = 'philosophy, news' oldest_article = 25 max_articles_per_feed = 200 diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index d8ef501eb6..c9334b22a3 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -259,5 +259,3 @@ class KF8Writer(object): start = 0 if i == 0 else self.fdst_table[-1].end self.fdst_table.append(FDST(start, start + len(flow))) - - From 03ed4010f58ebb8499d0b9d49ecc5c275214a3d4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 20:49:23 +0530 Subject: [PATCH 3/5] ... --- src/calibre/ebooks/mobi/writer8/skeleton.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index eff03c9de4..4b39d0cb15 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -165,7 +165,7 @@ class Chunker(object): # Set this to a list to enable dumping of the original and rebuilt # html files for debugging - self.orig_dumps = None + orig_dumps = None for i, item in enumerate(self.oeb.spine): root = self.remove_namespaces(self.data(item)) @@ -193,8 +193,8 @@ class Chunker(object): # for all chunks self.skeletons.append(Skeleton(i, item, root, chunks)) - if self.orig_dumps: - self.dump() + if orig_dumps: + self.dump(orig_dumps) # Create the SKEL and Chunk tables self.skel_table = [] @@ -356,7 +356,7 @@ class Chunker(object): return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text) - def dump(self): + def dump(self, orig_dumps): import tempfile, shutil, os tdir = os.path.join(tempfile.gettempdir(), 'skeleton') self.log('Skeletons dumped to:', tdir) @@ -368,7 +368,7 @@ class Chunker(object): os.makedirs(x) for i, skeleton in enumerate(self.skeletons): with open(os.path.join(orig, '%04d.html'%i), 'wb') as f: - f.write(self.orig_dumps[i]) + f.write(orig_dumps[i]) with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f: f.write(skeleton.rebuild()) From 8d44e8d83f4b7f84051463117cef5cfcfdad5252 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 21:08:27 +0530 Subject: [PATCH 4/5] ... --- src/calibre/ebooks/mobi/writer8/main.py | 9 ++++----- src/calibre/ebooks/mobi/writer8/skeleton.py | 9 +++++++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index c9334b22a3..430d695fd1 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -175,7 +175,7 @@ class KF8Writer(object): self.link_map = {} count = 0 hrefs = {item.href for item in self.oeb.spine} - for i, item in enumerate(self.oeb.spine): + for item in self.oeb.spine: root = self.data(item) for a in XPath('//h:a[@href]')(root): @@ -184,8 +184,7 @@ class KF8Writer(object): href, _, frag = ref.partition('#') href = urlnormalize(href) if href in hrefs: - placeholder = 'kindle:pos:fid:%04d:off:%s'%(i, - to_href(count)) + placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count) self.link_map[placeholder] = (href, frag) a.set('href', placeholder) @@ -201,9 +200,9 @@ class KF8Writer(object): aid = aidbase + j tag.attrib['aid'] = to_base(aid, base=32) if tag.tag == XHTML('body'): - self.id_map[(item.href, '')] = tag.attrib['aid'] + self.id_map[(item.href, '')] = (i, tag.attrib['aid']) if id_ is not None: - self.id_map[(item.href, id_)] = tag.attrib['aid'] + self.id_map[(item.href, id_)] = (i, tag.attrib['aid']) j += 1 diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index 4b39d0cb15..494aa30def 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -341,7 +341,12 @@ class Chunker(object): for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text): aid_map[match.group(1)] = match.start() self.aid_offset_map = aid_map - placeholder_map = {bytes(k):bytes(to_href(aid_map[v])) for k, v in + + def to_placeholder(x): + file_number, aid = x + return bytes('%04d:%s'%(file_number, to_href(aid_map[aid]))) + + placeholder_map = {bytes(k):to_placeholder(v) for k, v in self.placeholder_map.iteritems()} # Now update the links @@ -349,7 +354,7 @@ class Chunker(object): raw = match.group() pl = match.group(1) try: - return raw[:-10] + placeholder_map[pl] + return raw[:-15] + placeholder_map[pl] except KeyError: pass return raw From 6c631e0e64ce2ce7604367ebed60457d51924af2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 21:15:12 +0530 Subject: [PATCH 5/5] ... --- src/calibre/ebooks/mobi/writer8/skeleton.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index 494aa30def..d04f119316 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -371,10 +371,19 @@ class Chunker(object): rebuilt = os.path.join(tdir, 'rebuilt') for x in (orig, rebuilt): os.makedirs(x) + error = False for i, skeleton in enumerate(self.skeletons): + oraw, rraw = orig_dumps[i], skeleton.rebuild() with open(os.path.join(orig, '%04d.html'%i), 'wb') as f: - f.write(orig_dumps[i]) + f.write(oraw) with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f: - f.write(skeleton.rebuild()) + f.write(rraw) + if oraw != rraw: + error = True + if error: + raise ValueError('The before and after HTML differs. Run a diff ' + 'tool on the orig and rebuilt directories') + else: + self.log('Skeleton HTML before and after is identical.')