Implement the --linearize-tables transform.

2025-07-09 03:04:10 -04:00 · 2009-04-15 14:38:45 -07:00 · 2009-04-15 14:38:45 -07:00 · 35e8e347fe
commit 35e8e347fe
parent 9de708c70b
4 changed files with 50 additions and 4 deletions
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@@ -143,7 +143,7 @@ class OutputProfile(Plugin):
    # ADE dies an agonizing, long drawn out death if HTML files have more
    # bytes than this.
-    flow_size                 = sys.maxint
+    flow_size                 = -1
    # ADE runs screaming when it sees these characters
    remove_special_chars      = re.compile(u'[\u200b\u00ad]')
    # ADE falls to the ground in a dead faint when it sees an <object>
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -94,7 +94,8 @@ OptionRecommendation(name='font_size_mapping',
 OptionRecommendation(name='line_height',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('The line height in pts. Controls spacing between consecutive '
-                   'lines of text. By default ??'
+                   'lines of text. By default no line height manipulation is '
                   'performed.'
                   )
        ),
@@ -102,12 +103,25 @@ OptionRecommendation(name='linearize_tables',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Some badly designed documents use tables to control the '
                'layout of text on the page. When converted these documents '
-                'often have text that runs of the page and other artifacts. '
+                'often have text that runs off the page and other artifacts. '
                'This option will extract the content from the tables and '
                'present it in a linear fashion.'
                )
        ),
 OptionRecommendation(name='dont_split_on_page_breaks',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Turn off splitting at page breaks. Normally, input '
                    'files are automatically split at every page break into '
                    'two files. This gives an output ebook that can be '
                    'parsed faster and with less resources. However, '
                    'splitting is slow and if your source file contains a '
                    'very large number of page breaks, you should turn off '
                    'splitting on page breaks.'
                )
        ),
 OptionRecommendation(name='read_metadata_from_opf',
            recommended_value=None, level=OptionRecommendation.LOW,
            short_switch='m',
@@ -330,6 +344,17 @@ OptionRecommendation(name='language',
                untable=self.opts.linearize_tables)
        flattener(self.oeb, self.opts)
        if self.opts.linearize_tables:
            from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
            LinearizeTables()(self.oeb, self.opts)
        from calibre.ebooks.oeb.transforms.split import Split
        pbx = accelerators.get('pagebreaks', None)
        split = Split(not self.opts.dont_split_on_page_breaks,
                max_flow_size=self.opts.output_profile.flow_size,
                page_breaks_xpath=pbx)
        split(self.oeb, self.opts)
        from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer
        self.log.info('Cleaning up manifest...')
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -29,5 +29,5 @@ class MOBIInput(InputFormatPlugin):
            with open(f, 'wb') as q:
                q.write(html.tostring(root, encoding='utf-8', method='xml',
                    include_meta_content_type=False))
-            accelerators['pagebreaks'] = {f: '//*[@class="mbp_pagebreak"]'}
+                accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
        return mr.created_opf_path
--- a/src/calibre/ebooks/oeb/transforms/linearize_tables.py
+++ b/src/calibre/ebooks/oeb/transforms/linearize_tables.py
@@ -0,0 +1,21 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from calibre.ebooks.oeb.base import OEB_DOCS, XPNSMAP
 class LinearizeTables(object):
    """Transform that replaces table markup with plain ``div`` elements.

    Every table-related element found in a document is renamed to ``div``
    in place, so that its content is kept but is no longer laid out as a
    table. Children, text and attributes of the renamed elements are left
    untouched.
    """

    # Local names of the elements that get renamed. The section wrappers
    # (caption/thead/tbody/tfoot) are included because they are only
    # meaningful inside a <table>; leaving them behind once the table itself
    # has become a <div> would produce invalid markup.
    TABLE_TAGS = ('table', 'caption', 'thead', 'tbody', 'tfoot',
                  'tr', 'th', 'td')

    def linearize(self, root):
        """Rename all table elements below ``root`` to ``div``.

        :param root: parsed document tree exposing ``xpath(expr,
            namespaces=...)`` (presumably an lxml element -- confirm against
            the manifest item data type). It is modified in place.
        """
        # The elements are matched via the ``h`` prefix, i.e. they live in
        # the namespace XPNSMAP['h']. The replacement tag must be built in
        # that same namespace (Clark notation); a bare 'div' would move the
        # element out of the namespace, so later ``h:``-prefixed queries
        # would no longer see it.
        div_tag = '{%s}div' % XPNSMAP['h']
        expr = '|'.join('//h:' + name for name in self.TABLE_TAGS)
        for x in root.xpath(expr, namespaces=XPNSMAP):
            x.tag = div_tag

    def __call__(self, oeb, context):
        """Linearize the tables of every document in ``oeb``.

        :param oeb: book object; each entry of ``oeb.manifest.items`` whose
            ``media_type`` is in ``OEB_DOCS`` has its ``data`` tree
            processed by :meth:`linearize`.
        :param context: accepted for the transform calling convention; not
            used here.
        """
        for x in oeb.manifest.items:
            if x.media_type in OEB_DOCS:
                self.linearize(x.data)