From 35e8e347fea42005c861675d58b82e3559984066 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 15 Apr 2009 14:38:45 -0700 Subject: [PATCH] Implement the --linearize-tables transform. --- src/calibre/customize/profiles.py | 2 +- src/calibre/ebooks/conversion/plumber.py | 29 +++++++++++++++++-- src/calibre/ebooks/mobi/input.py | 2 +- .../ebooks/oeb/transforms/linearize_tables.py | 21 ++++++++++++++ 4 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 src/calibre/ebooks/oeb/transforms/linearize_tables.py diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index 8623a94ddd..c11529f025 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -143,7 +143,7 @@ class OutputProfile(Plugin): # ADE dies an agonizing, long drawn out death if HTML files have more # bytes than this. - flow_size = sys.maxint + flow_size = -1 # ADE runs screaming when it sees these characters remove_special_chars = re.compile(u'[\u200b\u00ad]') # ADE falls to the ground in a dead faint when it sees an diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index ab30e71ba1..119ae4d63e 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -94,7 +94,8 @@ OptionRecommendation(name='font_size_mapping', OptionRecommendation(name='line_height', recommended_value=None, level=OptionRecommendation.LOW, help=_('The line height in pts. Controls spacing between consecutive ' - 'lines of text. By default ??' + 'lines of text. By default no line height manipulation is ' + 'performed.' ) ), @@ -102,12 +103,25 @@ OptionRecommendation(name='linearize_tables', recommended_value=False, level=OptionRecommendation.LOW, help=_('Some badly designed documents use tables to control the ' 'layout of text on the page. When converted these documents ' - 'often have text that runs of the page and other artifacts. 
' + 'often have text that runs off the page and other artifacts. ' 'This option will extract the content from the tables and ' 'present it in a linear fashion.' ) ), +OptionRecommendation(name='dont_split_on_page_breaks', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Turn off splitting at page breaks. Normally, input ' + 'files are automatically split at every page break into ' + 'two files. This gives an output ebook that can be ' + 'parsed faster and with less resources. However, ' + 'splitting is slow and if your source file contains a ' + 'very large number of page breaks, you should turn off ' + 'splitting on page breaks.' + ) + ), + + OptionRecommendation(name='read_metadata_from_opf', recommended_value=None, level=OptionRecommendation.LOW, short_switch='m', @@ -330,6 +344,17 @@ OptionRecommendation(name='language', untable=self.opts.linearize_tables) flattener(self.oeb, self.opts) + if self.opts.linearize_tables: + from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables + LinearizeTables()(self.oeb, self.opts) + + from calibre.ebooks.oeb.transforms.split import Split + pbx = accelerators.get('pagebreaks', None) + split = Split(not self.opts.dont_split_on_page_breaks, + max_flow_size=self.opts.output_profile.flow_size, + page_breaks_xpath=pbx) + split(self.oeb, self.opts) + from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer self.log.info('Cleaning up manifest...') diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py index 2eb45c9161..97d94a0e33 100644 --- a/src/calibre/ebooks/mobi/input.py +++ b/src/calibre/ebooks/mobi/input.py @@ -29,5 +29,5 @@ class MOBIInput(InputFormatPlugin): with open(f, 'wb') as q: q.write(html.tostring(root, encoding='utf-8', method='xml', include_meta_content_type=False)) - accelerators['pagebreaks'] = {f: '//*[@class="mbp_pagebreak"]'} + accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]' return mr.created_opf_path diff --git 
a/src/calibre/ebooks/oeb/transforms/linearize_tables.py b/src/calibre/ebooks/oeb/transforms/linearize_tables.py
new file mode 100644
index 0000000000..a0c11f848c
--- /dev/null
+++ b/src/calibre/ebooks/oeb/transforms/linearize_tables.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__ = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+from calibre.ebooks.oeb.base import OEB_DOCS, XPNSMAP
+
+class LinearizeTables(object):
+    '''
+    Rewrite table markup as plain div elements so that the content of every
+    table is presented linearly.
+    '''
+
+    # Local names of the elements that get rewritten: the table itself, its
+    # caption, the row groups, the rows and the cells.
+    TABLE_TAGS = ('table', 'caption', 'thead', 'tbody', 'tfoot',
+                  'tr', 'td', 'th')
+
+    def linearize(self, root):
+        '''
+        Turn every table related element below `root` into a div, in place.
+
+        :param root: Parsed document; must offer an lxml style ``xpath()``.
+        '''
+        # The replacement tag must be namespace qualified. Assigning a bare
+        # 'div' would drop the element out of the namespace bound to the h:
+        # prefix, so it would no longer be matched by h:div expressions.
+        div = '{%s}div' % XPNSMAP['h']
+        expr = '|'.join('//h:' + name for name in self.TABLE_TAGS)
+        for elem in root.xpath(expr, namespaces=XPNSMAP):
+            elem.tag = div
+
+    def __call__(self, oeb, context):
+        '''
+        Linearize the tables in every manifest item of `oeb` whose media
+        type is listed in OEB_DOCS.
+
+        :param oeb: Book whose ``manifest.items`` are processed.
+        :param context: Not used by this transform.
+        '''
+        for item in oeb.manifest.items:
+            if item.media_type in OEB_DOCS:
+                self.linearize(item.data)