Implement the --linearize-tables transform.

This commit is contained in:
Kovid Goyal 2009-04-15 14:38:45 -07:00
parent 9de708c70b
commit 35e8e347fe
4 changed files with 50 additions and 4 deletions

View File

@ -143,7 +143,7 @@ class OutputProfile(Plugin):
# ADE dies an agonizing, long drawn out death if HTML files have more # ADE dies an agonizing, long drawn out death if HTML files have more
# bytes than this. # bytes than this.
flow_size = sys.maxint flow_size = -1
# ADE runs screaming when it sees these characters # ADE runs screaming when it sees these characters
remove_special_chars = re.compile(u'[\u200b\u00ad]') remove_special_chars = re.compile(u'[\u200b\u00ad]')
# ADE falls to the ground in a dead faint when it sees an <object> # ADE falls to the ground in a dead faint when it sees an <object>

View File

@ -94,7 +94,8 @@ OptionRecommendation(name='font_size_mapping',
OptionRecommendation(name='line_height', OptionRecommendation(name='line_height',
recommended_value=None, level=OptionRecommendation.LOW, recommended_value=None, level=OptionRecommendation.LOW,
help=_('The line height in pts. Controls spacing between consecutive ' help=_('The line height in pts. Controls spacing between consecutive '
'lines of text. By default ??' 'lines of text. By default no line height manipulation is '
'performed.'
) )
), ),
@ -102,12 +103,25 @@ OptionRecommendation(name='linearize_tables',
recommended_value=False, level=OptionRecommendation.LOW, recommended_value=False, level=OptionRecommendation.LOW,
help=_('Some badly designed documents use tables to control the ' help=_('Some badly designed documents use tables to control the '
'layout of text on the page. When converted these documents ' 'layout of text on the page. When converted these documents '
'often have text that runs of the page and other artifacts. ' 'often have text that runs off the page and other artifacts. '
'This option will extract the content from the tables and ' 'This option will extract the content from the tables and '
'present it in a linear fashion.' 'present it in a linear fashion.'
) )
), ),
OptionRecommendation(name='dont_split_on_page_breaks',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Turn off splitting at page breaks. Normally, input '
'files are automatically split at every page break into '
'two files. This gives an output ebook that can be '
'parsed faster and with less resources. However, '
'splitting is slow and if your source file contains a '
'very large number of page breaks, you should turn off '
'splitting on page breaks.'
)
),
OptionRecommendation(name='read_metadata_from_opf', OptionRecommendation(name='read_metadata_from_opf',
recommended_value=None, level=OptionRecommendation.LOW, recommended_value=None, level=OptionRecommendation.LOW,
short_switch='m', short_switch='m',
@ -330,6 +344,17 @@ OptionRecommendation(name='language',
untable=self.opts.linearize_tables) untable=self.opts.linearize_tables)
flattener(self.oeb, self.opts) flattener(self.oeb, self.opts)
if self.opts.linearize_tables:
from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
LinearizeTables()(self.oeb, self.opts)
from calibre.ebooks.oeb.transforms.split import Split
pbx = accelerators.get('pagebreaks', None)
split = Split(not self.opts.dont_split_on_page_breaks,
max_flow_size=self.opts.output_profile.flow_size,
page_breaks_xpath=pbx)
split(self.oeb, self.opts)
from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer
self.log.info('Cleaning up manifest...') self.log.info('Cleaning up manifest...')

View File

@ -29,5 +29,5 @@ class MOBIInput(InputFormatPlugin):
with open(f, 'wb') as q: with open(f, 'wb') as q:
q.write(html.tostring(root, encoding='utf-8', method='xml', q.write(html.tostring(root, encoding='utf-8', method='xml',
include_meta_content_type=False)) include_meta_content_type=False))
accelerators['pagebreaks'] = {f: '//*[@class="mbp_pagebreak"]'} accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
return mr.created_opf_path return mr.created_opf_path

View File

@ -0,0 +1,21 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.oeb.base import OEB_DOCS, XPNSMAP
class LinearizeTables(object):
    """Transform that flattens table markup into ``div`` elements.

    Calling an instance with an OEB book rewrites, in place, every
    manifest item whose media type is listed in ``OEB_DOCS``.
    """

    def linearize(self, root):
        """Retag every table, row and cell element in the tree as a div.

        Only the tag name is changed; children, text and attributes of
        each matched element are left untouched.

        :param root: parsed document exposing an lxml-style ``xpath``
            method (presumably an lxml element tree; confirm with caller).
        """
        # The query selects elements in the namespace bound to the ``h``
        # prefix, so the replacement tag has to be qualified with that
        # same namespace. Assigning a bare 'div' would move the element
        # into no namespace, producing mixed-namespace markup that later
        # ``h:div`` queries would fail to match.
        div_tag = '{%s}div' % XPNSMAP['h']
        for x in root.xpath('//h:table|//h:td|//h:tr|//h:th',
                            namespaces=XPNSMAP):
            x.tag = div_tag

    def __call__(self, oeb, context):
        """Linearize the tables of every document in the book's manifest.

        :param oeb: book object; ``oeb.manifest.items`` is iterated and
            each item's ``media_type`` and ``data`` are read.
        :param context: accepted for the transform call convention;
            not used by this transform.
        """
        for x in oeb.manifest.items:
            if x.media_type in OEB_DOCS:
                self.linearize(x.data)