mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Implement the --linearize-tables transform.
This commit is contained in:
parent
9de708c70b
commit
35e8e347fe
@ -143,7 +143,7 @@ class OutputProfile(Plugin):
|
||||
|
||||
# ADE dies an agonizing, long drawn out death if HTML files have more
|
||||
# bytes than this.
|
||||
flow_size = sys.maxint
|
||||
flow_size = -1
|
||||
# ADE runs screaming when it sees these characters
|
||||
remove_special_chars = re.compile(u'[\u200b\u00ad]')
|
||||
# ADE falls to the ground in a dead faint when it sees an <object>
|
||||
|
@ -94,7 +94,8 @@ OptionRecommendation(name='font_size_mapping',
|
||||
OptionRecommendation(name='line_height',
|
||||
recommended_value=None, level=OptionRecommendation.LOW,
|
||||
help=_('The line height in pts. Controls spacing between consecutive '
|
||||
'lines of text. By default ??'
|
||||
'lines of text. By default no line height manipulation is '
|
||||
'performed.'
|
||||
)
|
||||
),
|
||||
|
||||
@ -102,12 +103,25 @@ OptionRecommendation(name='linearize_tables',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Some badly designed documents use tables to control the '
|
||||
'layout of text on the page. When converted these documents '
|
||||
'often have text that runs of the page and other artifacts. '
|
||||
'often have text that runs off the page and other artifacts. '
|
||||
'This option will extract the content from the tables and '
|
||||
'present it in a linear fashion.'
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='dont_split_on_page_breaks',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Turn off splitting at page breaks. Normally, input '
|
||||
'files are automatically split at every page break into '
|
||||
'two files. This gives an output ebook that can be '
|
||||
'parsed faster and with less resources. However, '
|
||||
'splitting is slow and if your source file contains a '
|
||||
'very large number of page breaks, you should turn off '
|
||||
'splitting on page breaks.'
|
||||
)
|
||||
),
|
||||
|
||||
|
||||
OptionRecommendation(name='read_metadata_from_opf',
|
||||
recommended_value=None, level=OptionRecommendation.LOW,
|
||||
short_switch='m',
|
||||
@ -330,6 +344,17 @@ OptionRecommendation(name='language',
|
||||
untable=self.opts.linearize_tables)
|
||||
flattener(self.oeb, self.opts)
|
||||
|
||||
if self.opts.linearize_tables:
|
||||
from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
|
||||
LinearizeTables()(self.oeb, self.opts)
|
||||
|
||||
from calibre.ebooks.oeb.transforms.split import Split
|
||||
pbx = accelerators.get('pagebreaks', None)
|
||||
split = Split(not self.opts.dont_split_on_page_breaks,
|
||||
max_flow_size=self.opts.output_profile.flow_size,
|
||||
page_breaks_xpath=pbx)
|
||||
split(self.oeb, self.opts)
|
||||
|
||||
from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer
|
||||
|
||||
self.log.info('Cleaning up manifest...')
|
||||
|
@ -29,5 +29,5 @@ class MOBIInput(InputFormatPlugin):
|
||||
with open(f, 'wb') as q:
|
||||
q.write(html.tostring(root, encoding='utf-8', method='xml',
|
||||
include_meta_content_type=False))
|
||||
accelerators['pagebreaks'] = {f: '//*[@class="mbp_pagebreak"]'}
|
||||
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
|
||||
return mr.created_opf_path
|
||||
|
21
src/calibre/ebooks/oeb/transforms/linearize_tables.py
Normal file
21
src/calibre/ebooks/oeb/transforms/linearize_tables.py
Normal file
@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import with_statement
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS, XPNSMAP
|
||||
|
||||
class LinearizeTables(object):
    '''
    Transform that converts table markup into plain ``div`` elements.

    Calling an instance with an OEB book walks every manifest item whose
    media type is listed in OEB_DOCS and rewrites its parsed tree in place.
    '''

    def linearize(self, root):
        '''
        Rename every table, tr, td and th element under `root` to ``div``.

        :param root: Parsed document tree exposing an ``xpath`` method that
                     accepts a ``namespaces`` keyword (presumably an lxml
                     element; confirm against the callers).

        The tree is modified in place and nothing is returned.
        '''
        for x in root.xpath('//h:table|//h:td|//h:tr|//h:th',
                namespaces=XPNSMAP):
            # The matched elements carry a namespace-qualified tag of the
            # form '{namespace}name'. Assigning a bare 'div' would move the
            # element out of that namespace, so later namespaced queries
            # such as '//h:div' would no longer find it. Reuse the
            # element's own '{namespace}' prefix instead; rpartition yields
            # empty strings when the tag has no namespace, so a bare tag
            # simply becomes 'div'.
            namespace, brace, _ = x.tag.rpartition('}')
            x.tag = namespace + brace + 'div'

    def __call__(self, oeb, context):
        '''
        Linearize the tables in every document of `oeb`.

        :param oeb: Book object whose ``manifest.items`` are iterated; each
                    item provides ``media_type`` and ``data``.
        :param context: Not used by this transform.
        '''
        for x in oeb.manifest.items:
            # Only items with a media type in OEB_DOCS are processed.
            if x.media_type in OEB_DOCS:
                self.linearize(x.data)
|
Loading…
x
Reference in New Issue
Block a user