mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Conversion: Detect and remove fake page margins that are specified as a margin on every paragraph. This can be turned off via an option under Structure Detection
This commit is contained in:
parent
d12b40a18e
commit
83c8257a14
@ -49,6 +49,8 @@ HEURISTIC_OPTIONS = ['markup_chapter_headings',
|
||||
'dehyphenate', 'renumber_headings',
|
||||
'replace_scene_breaks']
|
||||
|
||||
DEFAULT_TRUE_OPTIONS = HEURISTIC_OPTIONS + ['remove_fake_margins']
|
||||
|
||||
def print_help(parser, log):
|
||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
||||
log(help)
|
||||
@ -90,7 +92,7 @@ def option_recommendation_to_cli_option(add_option, rec):
|
||||
if opt.long_switch == 'verbose':
|
||||
attrs['action'] = 'count'
|
||||
attrs.pop('type', '')
|
||||
if opt.name in HEURISTIC_OPTIONS and rec.recommended_value is True:
|
||||
if opt.name in DEFAULT_TRUE_OPTIONS and rec.recommended_value is True:
|
||||
switches = ['--disable-'+opt.long_switch]
|
||||
add_option(Option(*switches, **attrs))
|
||||
|
||||
@ -162,6 +164,7 @@ def add_pipeline_options(parser, plumber):
|
||||
'chapter', 'chapter_mark',
|
||||
'prefer_metadata_cover', 'remove_first_image',
|
||||
'insert_metadata', 'page_breaks_before',
|
||||
'remove_fake_margins',
|
||||
]
|
||||
),
|
||||
|
||||
|
@ -304,6 +304,17 @@ OptionRecommendation(name='page_breaks_before',
|
||||
'before the specified elements.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='remove_fake_margins',
|
||||
recommended_value=True, level=OptionRecommendation.LOW,
|
||||
help=_('Some documents specify page margins by '
|
||||
'specifying a left and right margin on each individual '
|
||||
'paragraph. calibre will try to detect and remove these '
|
||||
'margins. Sometimes, this can cause the removal of '
|
||||
'margins that should not have been removed. In this '
|
||||
'case you can disable the removal.')
|
||||
),
|
||||
|
||||
|
||||
OptionRecommendation(name='margin_top',
|
||||
recommended_value=5.0, level=OptionRecommendation.LOW,
|
||||
help=_('Set the top margin in pts. Default is %default. '
|
||||
@ -988,9 +999,13 @@ OptionRecommendation(name='sr3_replace',
|
||||
page_break_on_body=self.output_plugin.file_type in ('mobi',
|
||||
'lit'))
|
||||
flattener(self.oeb, self.opts)
|
||||
|
||||
self.opts.insert_blank_line = oibl
|
||||
self.opts.remove_paragraph_spacing = orps
|
||||
|
||||
from calibre.ebooks.oeb.transforms.page_margin import RemoveFakeMargins
|
||||
RemoveFakeMargins()(self.oeb, self.log, self.opts)
|
||||
|
||||
pr(0.9)
|
||||
self.flush()
|
||||
|
||||
|
153
src/calibre/ebooks/oeb/transforms/page_margin.py
Normal file
153
src/calibre/ebooks/oeb/transforms/page_margin.py
Normal file
@ -0,0 +1,153 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from collections import Counter
|
||||
|
||||
from calibre.ebooks.oeb.base import OEB_STYLES, barename, XPath
|
||||
|
||||
class RemoveFakeMargins(object):

    '''
    Remove left and right margins from paragraphs/divs if the same margin is
    specified on almost all the elements at that level.

    A "level" is the combination of the tag name (``p`` or ``div``) and the
    nesting depth of the element below ``<body>``, e.g. ``p_1`` or ``div_2``.

    Must be called only after CSS flattening
    '''

    # Levels with fewer elements than this are considered too small to draw
    # any statistical conclusion from and are ignored.
    MIN_LEVEL_SIZE = 25
    # A shallow (depth < 3) div is only considered if it contains at least
    # this many descendant paragraphs/divs.
    MIN_CHILD_PARAS = 5
    # Fraction of elements in a level that must share a margin value before
    # that margin is treated as a fake page margin.
    COMMON_MARGIN_THRESHOLD = 0.95

    def __call__(self, oeb, log, opts):
        '''
        Run the transform on ``oeb``.

        Does nothing if ``opts.remove_fake_margins`` is false or if the
        manifest contains no stylesheet. Only the first stylesheet found in
        the manifest is examined.
        '''
        if not opts.remove_fake_margins:
            return
        self.oeb, self.log, self.opts = oeb, log, opts
        self.levels = {}
        self.stats = {}
        self.selector_map = {}

        stylesheet = None
        for item in self.oeb.manifest:
            if item.media_type.lower() in OEB_STYLES:
                stylesheet = item
                break
        if stylesheet is None:
            return

        self.log('Removing fake margins...')

        stylesheet = stylesheet.data

        # Map the full selector text of every style rule to its declaration
        # block, so that get_margins() can look rules up by class name.
        from cssutils.css import CSSRule
        for rule in stylesheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
            self.selector_map[rule.selectorList.selectorText] = rule.style

        self.find_levels()

        for level in self.levels:
            self.process_level(level)

    def get_margins(self, elem):
        '''
        Return ``(margin_left, margin_right, style)`` for ``elem``.

        The style is looked up in ``self.selector_map`` under the selector
        ``'.' + <value of the class attribute>``. If the element has no class
        or no matching rule exists, ``('', '', None)`` is returned.
        '''
        cls = elem.get('class', None)
        if cls:
            # NOTE(review): the whole attribute value is used as one class
            # name; presumably flattening leaves a single class per element.
            style = self.selector_map.get('.'+cls, None)
            if style:
                return style.marginLeft, style.marginRight, style
        return '', '', None

    def process_level(self, level):
        '''
        Gather left/right margin statistics for every element in ``level``
        and, if one margin value dominates (see analyze_stats()), remove that
        margin property from the style rules of the elements that use it.
        '''
        elems = self.levels[level]
        left_key, right_key = level+'_left', level+'_right'
        left_stats = self.stats[left_key] = Counter()
        right_stats = self.stats[right_key] = Counter()

        for elem in elems:
            lm, rm = self.get_margins(elem)[:2]
            left_stats[lm] += 1
            right_stats[rm] += 1

        self.log.debug(level, ' left margin stats:', left_stats)
        self.log.debug(level, ' right margin stats:', right_stats)

        remove_left = self.analyze_stats(left_stats)
        remove_right = self.analyze_stats(right_stats)

        # Always bound, so the removal loop never touches an undefined name
        mcl = mcr = None

        if remove_left:
            mcl = left_stats.most_common(1)[0][0]
            self.log('Removing level %s left margin of:'%level, mcl)

        if remove_right:
            mcr = right_stats.most_common(1)[0][0]
            self.log('Removing level %s right margin of:'%level, mcr)

        if not (remove_left or remove_right):
            return

        for elem in elems:
            # Margins are re-read here on purpose: the style objects are
            # shared between elements and are modified as we go.
            lm, rm, style = self.get_margins(elem)
            if style is None:
                continue
            if remove_left and lm == mcl:
                style.removeProperty('margin-left')
            if remove_right and rm == mcr:
                style.removeProperty('margin-right')

    def find_levels(self):
        '''
        Populate ``self.levels``, mapping ``'<tag>_<depth>'`` to the list of
        ``p``/``div`` elements found at that depth below ``<body>`` across
        all spine items, then discard the levels that are too small to be
        analyzed.
        '''

        def level_of(elem, body):
            # Depth of elem below body; a direct child of body has depth 1
            ans = 1
            while elem.getparent() is not body:
                ans += 1
                elem = elem.getparent()
            return ans

        paras = XPath('descendant::h:p|descendant::h:div')

        for item in self.oeb.spine:
            body = XPath('//h:body')(item.data)
            if not body:
                continue
            body = body[0]

            for p in paras(body):
                level = '%s_%d'%(barename(p.tag), level_of(p, body))
                self.levels.setdefault(level, []).append(p)

        remove = set()
        # items() rather than iteritems() so this works on Python 2 and 3.
        # Only the value lists are modified in the loop, never the dict keys.
        for k, v in self.levels.items():
            num = len(v)
            self.log.debug('Found %d items of level:'%num, k)
            tag, depth = k.rsplit('_', 1)
            level = int(depth)
            if tag == 'p' and num < self.MIN_LEVEL_SIZE:
                remove.add(k)
            if tag == 'div':
                if level > 2 and num < self.MIN_LEVEL_SIZE:
                    remove.add(k)
                elif level < 3:
                    # Check each level < 3 element and only keep those
                    # that have many child paras. Filter in place, in a
                    # single pass, so self.levels[k] stays the same list.
                    v[:] = [elem for elem in v
                            if len(paras(elem)) >= self.MIN_CHILD_PARAS]

        for k in remove:
            self.levels.pop(k)
            self.log.debug('Ignoring level', k)

    def analyze_stats(self, stats):
        '''
        Return True if the most common margin value in ``stats`` (a Counter
        mapping margin value to number of elements) is a non-empty, non-'0'
        value used by more than 95% of the elements.
        '''
        if not stats:
            return False
        most_common, most_common_count = stats.most_common(1)[0]
        if not most_common or most_common == '0':
            # The dominant case is "no margin"; there is nothing to remove
            return False
        total = sum(stats.values())
        # True if greater than 95% of elements have the same margin
        return most_common_count/total > self.COMMON_MARGIN_THRESHOLD
|
@ -21,7 +21,7 @@ class StructureDetectionWidget(Widget, Ui_Form):
|
||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||
Widget.__init__(self, parent,
|
||||
['chapter', 'chapter_mark',
|
||||
'remove_first_image',
|
||||
'remove_first_image', 'remove_fake_margins',
|
||||
'insert_metadata', 'page_breaks_before']
|
||||
)
|
||||
self.db, self.book_id = db, book_id
|
||||
|
@ -48,10 +48,10 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="6" column="0" colspan="3">
|
||||
<item row="7" column="0" colspan="3">
|
||||
<widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
|
||||
</item>
|
||||
<item row="7" column="0" colspan="3">
|
||||
<item row="8" column="0" colspan="3">
|
||||
<spacer name="verticalSpacer">
|
||||
<property name="orientation">
|
||||
<enum>Qt::Vertical</enum>
|
||||
@ -77,7 +77,7 @@
|
||||
</property>
|
||||
</spacer>
|
||||
</item>
|
||||
<item row="4" column="0" colspan="3">
|
||||
<item row="5" column="0" colspan="3">
|
||||
<widget class="QLabel" name="label_2">
|
||||
<property name="text">
|
||||
<string>The header and footer removal options have been replaced by the Search & Replace options. Click the Search & Replace category in the bar to the left to use these options. Leave the replace field blank and enter your header/footer removal regexps into the search field.</string>
|
||||
@ -87,6 +87,13 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="2" column="2">
|
||||
<widget class="QCheckBox" name="opt_remove_fake_margins">
|
||||
<property name="text">
|
||||
<string>Remove &fake margins</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
<customwidgets>
|
||||
|
Loading…
x
Reference in New Issue
Block a user