diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 975507e2a7..f1d5d5fe1b 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -49,6 +49,8 @@ HEURISTIC_OPTIONS = ['markup_chapter_headings', 'dehyphenate', 'renumber_headings', 'replace_scene_breaks'] +DEFAULT_TRUE_OPTIONS = HEURISTIC_OPTIONS + ['remove_fake_margins'] + def print_help(parser, log): help = parser.format_help().encode(preferred_encoding, 'replace') log(help) @@ -90,7 +92,7 @@ def option_recommendation_to_cli_option(add_option, rec): if opt.long_switch == 'verbose': attrs['action'] = 'count' attrs.pop('type', '') - if opt.name in HEURISTIC_OPTIONS and rec.recommended_value is True: + if opt.name in DEFAULT_TRUE_OPTIONS and rec.recommended_value is True: switches = ['--disable-'+opt.long_switch] add_option(Option(*switches, **attrs)) @@ -162,6 +164,7 @@ def add_pipeline_options(parser, plumber): 'chapter', 'chapter_mark', 'prefer_metadata_cover', 'remove_first_image', 'insert_metadata', 'page_breaks_before', + 'remove_fake_margins', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 9a0c3f3c7f..6272e7b10b 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -304,6 +304,17 @@ OptionRecommendation(name='page_breaks_before', 'before the specified elements.') ), +OptionRecommendation(name='remove_fake_margins', + recommended_value=True, level=OptionRecommendation.LOW, + help=_('Some documents specify page margins by ' + 'specifying a left and right margin on each individual ' + 'paragraph. calibre will try to detect and remove these ' + 'margins. Sometimes, this can cause the removal of ' + 'margins that should not have been removed. 
class RemoveFakeMargins(object):

    '''
    Remove left and right margins from paragraphs/divs if the same margin is
    specified on almost all the elements at that level.

    Must be called only after CSS flattening
    '''

    def __call__(self, oeb, log, opts):
        '''
        Run the transform on ``oeb``.

        Does nothing when ``opts.remove_fake_margins`` is false or when the
        manifest contains no stylesheet item. Otherwise the style rules of the
        first stylesheet found are indexed by selector text, the p/div
        elements of the spine are grouped into levels, and every level is
        processed in turn.
        '''
        if not opts.remove_fake_margins:
            return
        # Imported locally so that this class does not depend on
        # module-level names being present.
        from calibre.ebooks.oeb.base import OEB_STYLES

        self.oeb, self.log, self.opts = oeb, log, opts
        self.levels = {}
        self.stats = {}
        self.selector_map = {}

        # Only the first stylesheet in the manifest is considered
        stylesheet = None
        for item in self.oeb.manifest:
            if item.media_type.lower() in OEB_STYLES:
                stylesheet = item
                break
        if stylesheet is None:
            return

        self.log('Removing fake margins...')

        stylesheet = stylesheet.data

        from cssutils.css import CSSRule
        for rule in stylesheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
            self.selector_map[rule.selectorList.selectorText] = rule.style

        self.find_levels()

        for level in self.levels:
            self.process_level(level)

    def get_margins(self, elem):
        '''
        Return ``(margin_left, margin_right, style)`` for ``elem``.

        The style is looked up in ``self.selector_map`` under the key
        ``'.' + class attribute``. If the element has no class attribute, or
        no (truthy) style is registered for it, ``('', '', None)`` is
        returned.
        '''
        cls = elem.get('class', None)
        if cls:
            style = self.selector_map.get('.'+cls, None)
            if style:
                return style.marginLeft, style.marginRight, style
        return '', '', None

    def process_level(self, level):
        '''
        Collect left/right margin statistics for all elements of ``level``
        and remove the dominant margin from the matching styles when
        :meth:`analyze_stats` says it is a fake margin.
        '''
        from collections import Counter

        elems = self.levels[level]
        left_stats = self.stats[level+'_left'] = Counter()
        right_stats = self.stats[level+'_right'] = Counter()

        for elem in elems:
            lm, rm = self.get_margins(elem)[:2]
            left_stats[lm] += 1
            right_stats[rm] += 1

        self.log.debug(level, ' left margin stats:', left_stats)
        self.log.debug(level, ' right margin stats:', right_stats)

        remove_left = self.analyze_stats(left_stats)
        remove_right = self.analyze_stats(right_stats)

        # Always bound, so the comparisons below never hit an undefined name
        mcl = mcr = None

        if remove_left:
            mcl = left_stats.most_common(1)[0][0]
            self.log('Removing level %s left margin of:'%level, mcl)

        if remove_right:
            mcr = right_stats.most_common(1)[0][0]
            self.log('Removing level %s right margin of:'%level, mcr)

        if remove_left or remove_right:
            for elem in elems:
                lm, rm, style = self.get_margins(elem)
                if style is None:
                    # No style registered for this element, nothing to strip
                    continue
                if remove_left and lm == mcl:
                    style.removeProperty('margin-left')
                if remove_right and rm == mcr:
                    style.removeProperty('margin-right')

    def find_levels(self):
        '''
        Populate ``self.levels``, a map of ``'<tag>_<depth>'`` to the list of
        p/div elements with that tag at that depth below ``<body>`` (direct
        children of body have depth 1), over all spine items.

        Levels with too few elements to give meaningful statistics are then
        dropped: p levels with fewer than 25 elements and div levels deeper
        than 2 with fewer than 25 elements. For div levels of depth 1 or 2,
        only the divs containing at least 5 descendant p/div elements are
        kept.
        '''
        from calibre.ebooks.oeb.base import XPath, barename

        def level_of(elem, body):
            ans = 1
            while elem.getparent() is not body:
                ans += 1
                elem = elem.getparent()
            return ans

        paras = XPath('descendant::h:p|descendant::h:div')
        find_body = XPath('//h:body')

        for item in self.oeb.spine:
            body = find_body(item.data)
            if not body:
                continue
            body = body[0]

            for p in paras(body):
                level = '%s_%d'%(barename(p.tag), level_of(p, body))
                self.levels.setdefault(level, []).append(p)

        remove = set()
        # items() rather than iteritems(): works on both Python 2 and 3
        for k, v in self.levels.items():
            num = len(v)
            self.log.debug('Found %d items of level:'%num, k)
            level = int(k.split('_')[-1])
            tag = k.split('_')[0]
            if tag == 'p' and num < 25:
                remove.add(k)
            if tag == 'div':
                if level > 2 and num < 25:
                    remove.add(k)
                elif level < 3:
                    # Check each level < 3 element and only keep those
                    # that have many child paras. Filtered in place, since
                    # v is the list stored in self.levels.
                    v[:] = [elem for elem in v if len(paras(elem)) >= 5]

        for k in remove:
            self.levels.pop(k)
            self.log.debug('Ignoring level', k)

    def analyze_stats(self, stats):
        '''
        Return True if the most common margin value in ``stats`` (a Counter
        of margin value -> number of elements) should be removed, i.e. it is
        neither empty nor ``'0'`` and is used by more than 95% of the
        elements. Empty stats yield False.
        '''
        if not stats:
            return False
        most_common, most_common_count = stats.most_common(1)[0]
        if not most_common or most_common == '0':
            return False
        total = sum(stats.values())
        # True if greater than 95% of elements have the same margin
        return most_common_count/total > 0.95
'remove_fake_margins', 'insert_metadata', 'page_breaks_before'] ) self.db, self.book_id = db, book_id diff --git a/src/calibre/gui2/convert/structure_detection.ui b/src/calibre/gui2/convert/structure_detection.ui index f80e6f8182..4ba90c1c2c 100644 --- a/src/calibre/gui2/convert/structure_detection.ui +++ b/src/calibre/gui2/convert/structure_detection.ui @@ -48,10 +48,10 @@ - + - + Qt::Vertical @@ -77,7 +77,7 @@ - + The header and footer removal options have been replaced by the Search & Replace options. Click the Search & Replace category in the bar to the left to use these options. Leave the replace field blank and enter your header/footer removal regexps into the search field. @@ -87,6 +87,13 @@ + + + + Remove &fake margins + + +