mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Conversion: Detect and remove fake page margins that are specified as a margin on every paragraph. This can be turned off via an option under Structure Detection
This commit is contained in:
parent
d12b40a18e
commit
83c8257a14
@ -49,6 +49,8 @@ HEURISTIC_OPTIONS = ['markup_chapter_headings',
|
|||||||
'dehyphenate', 'renumber_headings',
|
'dehyphenate', 'renumber_headings',
|
||||||
'replace_scene_breaks']
|
'replace_scene_breaks']
|
||||||
|
|
||||||
|
DEFAULT_TRUE_OPTIONS = HEURISTIC_OPTIONS + ['remove_fake_margins']
|
||||||
|
|
||||||
def print_help(parser, log):
|
def print_help(parser, log):
|
||||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
help = parser.format_help().encode(preferred_encoding, 'replace')
|
||||||
log(help)
|
log(help)
|
||||||
@ -90,7 +92,7 @@ def option_recommendation_to_cli_option(add_option, rec):
|
|||||||
if opt.long_switch == 'verbose':
|
if opt.long_switch == 'verbose':
|
||||||
attrs['action'] = 'count'
|
attrs['action'] = 'count'
|
||||||
attrs.pop('type', '')
|
attrs.pop('type', '')
|
||||||
if opt.name in HEURISTIC_OPTIONS and rec.recommended_value is True:
|
if opt.name in DEFAULT_TRUE_OPTIONS and rec.recommended_value is True:
|
||||||
switches = ['--disable-'+opt.long_switch]
|
switches = ['--disable-'+opt.long_switch]
|
||||||
add_option(Option(*switches, **attrs))
|
add_option(Option(*switches, **attrs))
|
||||||
|
|
||||||
@ -162,6 +164,7 @@ def add_pipeline_options(parser, plumber):
|
|||||||
'chapter', 'chapter_mark',
|
'chapter', 'chapter_mark',
|
||||||
'prefer_metadata_cover', 'remove_first_image',
|
'prefer_metadata_cover', 'remove_first_image',
|
||||||
'insert_metadata', 'page_breaks_before',
|
'insert_metadata', 'page_breaks_before',
|
||||||
|
'remove_fake_margins',
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
|
|
||||||
|
@ -304,6 +304,17 @@ OptionRecommendation(name='page_breaks_before',
|
|||||||
'before the specified elements.')
|
'before the specified elements.')
|
||||||
),
|
),
|
||||||
|
|
||||||
|
OptionRecommendation(name='remove_fake_margins',
|
||||||
|
recommended_value=True, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Some documents specify page margins by '
|
||||||
|
'specifying a left and right margin on each individual '
|
||||||
|
'paragraph. calibre will try to detect and remove these '
|
||||||
|
'margins. Sometimes, this can cause the removal of '
|
||||||
|
'margins that should not have been removed. In this '
|
||||||
|
'case you can disable the removal.')
|
||||||
|
),
|
||||||
|
|
||||||
|
|
||||||
OptionRecommendation(name='margin_top',
|
OptionRecommendation(name='margin_top',
|
||||||
recommended_value=5.0, level=OptionRecommendation.LOW,
|
recommended_value=5.0, level=OptionRecommendation.LOW,
|
||||||
help=_('Set the top margin in pts. Default is %default. '
|
help=_('Set the top margin in pts. Default is %default. '
|
||||||
@ -988,9 +999,13 @@ OptionRecommendation(name='sr3_replace',
|
|||||||
page_break_on_body=self.output_plugin.file_type in ('mobi',
|
page_break_on_body=self.output_plugin.file_type in ('mobi',
|
||||||
'lit'))
|
'lit'))
|
||||||
flattener(self.oeb, self.opts)
|
flattener(self.oeb, self.opts)
|
||||||
|
|
||||||
self.opts.insert_blank_line = oibl
|
self.opts.insert_blank_line = oibl
|
||||||
self.opts.remove_paragraph_spacing = orps
|
self.opts.remove_paragraph_spacing = orps
|
||||||
|
|
||||||
|
from calibre.ebooks.oeb.transforms.page_margin import RemoveFakeMargins
|
||||||
|
RemoveFakeMargins()(self.oeb, self.log, self.opts)
|
||||||
|
|
||||||
pr(0.9)
|
pr(0.9)
|
||||||
self.flush()
|
self.flush()
|
||||||
|
|
||||||
|
153
src/calibre/ebooks/oeb/transforms/page_margin.py
Normal file
153
src/calibre/ebooks/oeb/transforms/page_margin.py
Normal file
@ -0,0 +1,153 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
from calibre.ebooks.oeb.base import OEB_STYLES, barename, XPath
|
||||||
|
|
||||||
|
class RemoveFakeMargins(object):

    '''
    Remove left and right margins from paragraphs/divs if the same margin is
    specified on almost all the elements at that level.

    Must be called only after CSS flattening
    '''

    def __call__(self, oeb, log, opts):
        '''
        Run the transform on ``oeb``.

        Does nothing when ``opts.remove_fake_margins`` is false or when the
        manifest contains no item whose media type is in OEB_STYLES.

        :param oeb: The book being converted; its ``manifest`` and ``spine``
                    are read.
        :param log: Callable logger that also provides a ``debug`` method.
        :param opts: Conversion options; only ``remove_fake_margins`` is
                     read here.
        '''
        if not opts.remove_fake_margins:
            return
        self.oeb, self.log, self.opts = oeb, log, opts
        # level key (e.g. 'p_2') -> list of elements found at that level
        self.levels = {}
        # '<level>_left' / '<level>_right' -> Counter of margin values
        self.stats = {}
        # selector text -> style declaration of the matching CSS rule
        self.selector_map = {}

        # Only the first stylesheet in the manifest is examined
        stylesheet = None
        for item in self.oeb.manifest:
            if item.media_type.lower() in OEB_STYLES:
                stylesheet = item
                break
        if stylesheet is None:
            return

        self.log('Removing fake margins...')

        stylesheet = stylesheet.data

        from cssutils.css import CSSRule
        for rule in stylesheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
            self.selector_map[rule.selectorList.selectorText] = rule.style

        self.find_levels()

        for level in self.levels:
            self.process_level(level)

    def get_margins(self, elem):
        '''
        Return ``(margin_left, margin_right, style)`` for ``elem``.

        The style is looked up in ``self.selector_map`` under the selector
        ``'.' + <value of the class attribute>``; the whole attribute value is
        used as a single class name. If the element has no class, or no usable
        style is registered for it, ``('', '', None)`` is returned.
        '''
        cls = elem.get('class', None)
        if cls:
            style = self.selector_map.get('.'+cls, None)
            if style:
                return style.marginLeft, style.marginRight, style
        return '', '', None

    def process_level(self, level):
        '''
        Collect margin statistics for all elements of ``level`` and, if one
        left and/or right margin value dominates (see :meth:`analyze_stats`),
        remove that margin from the styles of the elements that carry it.

        :param level: A key of ``self.levels``.
        '''
        elems = self.levels[level]
        left_stats = self.stats[level+'_left'] = Counter()
        right_stats = self.stats[level+'_right'] = Counter()

        for elem in elems:
            lm, rm = self.get_margins(elem)[:2]
            left_stats[lm] += 1
            right_stats[rm] += 1

        self.log.debug(level, ' left margin stats:', left_stats)
        self.log.debug(level, ' right margin stats:', right_stats)

        remove_left = self.analyze_stats(left_stats)
        remove_right = self.analyze_stats(right_stats)

        mcl = mcr = None
        if remove_left:
            mcl = left_stats.most_common(1)[0][0]
            self.log('Removing level %s left margin of:'%level, mcl)

        if remove_right:
            mcr = right_stats.most_common(1)[0][0]
            self.log('Removing level %s right margin of:'%level, mcr)

        if remove_left or remove_right:
            for elem in elems:
                # Margins are re-read here on purpose: several elements can
                # share one style object, which may already have been edited
                # by a previous iteration.
                lm, rm, style = self.get_margins(elem)
                if style is None:
                    # No style means empty margins, which can never equal the
                    # (non-empty) dominant value; nothing to remove.
                    continue
                if remove_left and lm == mcl:
                    style.removeProperty('margin-left')
                if remove_right and rm == mcr:
                    style.removeProperty('margin-right')

    def find_levels(self):
        '''
        Populate ``self.levels`` with the <p> and <div> elements of every
        spine item, grouped by a key of the form ``'<tag>_<depth>'`` where
        depth is 1 for direct children of <body>.

        Groups that are too small to give meaningful statistics are then
        discarded, and shallow <div> groups are reduced to the divs that
        contain at least five <p>/<div> descendants.
        '''

        def level_of(elem, body):
            # Number of steps from elem up to body (direct child == 1)
            ans = 1
            while elem.getparent() is not body:
                ans += 1
                elem = elem.getparent()
            return ans

        # Both expressions are loop invariant, so build them only once
        paras = XPath('descendant::h:p|descendant::h:div')
        find_body = XPath('//h:body')

        for item in self.oeb.spine:
            body = find_body(item.data)
            if not body:
                continue
            body = body[0]

            for p in paras(body):
                level = '%s_%d'%(barename(p.tag), level_of(p, body))
                self.levels.setdefault(level, []).append(p)

        remove = set()
        # items() rather than iteritems() so this runs on python 2 and 3; the
        # dict itself is not resized inside the loop, only its list values.
        for k, v in self.levels.items():
            num = len(v)
            self.log.debug('Found %d items of level:'%num, k)
            level = int(k.split('_')[-1])
            tag = k.split('_')[0]
            if tag == 'p' and num < 25:
                remove.add(k)
            if tag == 'div':
                if level > 2 and num < 25:
                    remove.add(k)
                elif level < 3:
                    # Check each level < 3 element and only keep those
                    # that have many child paras. Filter in place so that
                    # self.levels[k] keeps referring to the same list.
                    v[:] = [elem for elem in v if len(paras(elem)) >= 5]

        for k in remove:
            self.levels.pop(k)
            self.log.debug('Ignoring level', k)

    def analyze_stats(self, stats):
        '''
        Decide whether the most common margin in ``stats`` should be removed.

        :param stats: A Counter mapping margin values to the number of
                      elements that have them.
        :return: True only if the most common value is neither empty nor
                 ``'0'`` and is used by more than 95% of the counted elements.
        '''
        if not stats:
            return False
        # most_common(1) yields exactly one entry for a non-empty Counter
        most_common, most_common_count = stats.most_common(1)[0]
        if not most_common or most_common == '0':
            return False
        total = sum(stats.values())
        # True if greater than 95% of elements have the same margin. This
        # relies on true division (the module imports division from
        # __future__).
        return most_common_count/total > 0.95
|
@ -21,7 +21,7 @@ class StructureDetectionWidget(Widget, Ui_Form):
|
|||||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||||
Widget.__init__(self, parent,
|
Widget.__init__(self, parent,
|
||||||
['chapter', 'chapter_mark',
|
['chapter', 'chapter_mark',
|
||||||
'remove_first_image',
|
'remove_first_image', 'remove_fake_margins',
|
||||||
'insert_metadata', 'page_breaks_before']
|
'insert_metadata', 'page_breaks_before']
|
||||||
)
|
)
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
|
@ -48,10 +48,10 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="6" column="0" colspan="3">
|
<item row="7" column="0" colspan="3">
|
||||||
<widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
|
<widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
|
||||||
</item>
|
</item>
|
||||||
<item row="7" column="0" colspan="3">
|
<item row="8" column="0" colspan="3">
|
||||||
<spacer name="verticalSpacer">
|
<spacer name="verticalSpacer">
|
||||||
<property name="orientation">
|
<property name="orientation">
|
||||||
<enum>Qt::Vertical</enum>
|
<enum>Qt::Vertical</enum>
|
||||||
@ -77,7 +77,7 @@
|
|||||||
</property>
|
</property>
|
||||||
</spacer>
|
</spacer>
|
||||||
</item>
|
</item>
|
||||||
<item row="4" column="0" colspan="3">
|
<item row="5" column="0" colspan="3">
|
||||||
<widget class="QLabel" name="label_2">
|
<widget class="QLabel" name="label_2">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>The header and footer removal options have been replaced by the Search & Replace options. Click the Search & Replace category in the bar to the left to use these options. Leave the replace field blank and enter your header/footer removal regexps into the search field.</string>
|
<string>The header and footer removal options have been replaced by the Search & Replace options. Click the Search & Replace category in the bar to the left to use these options. Leave the replace field blank and enter your header/footer removal regexps into the search field.</string>
|
||||||
@ -87,6 +87,13 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
|
<item row="2" column="2">
|
||||||
|
<widget class="QCheckBox" name="opt_remove_fake_margins">
|
||||||
|
<property name="text">
|
||||||
|
<string>Remove &fake margins</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
</layout>
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
<customwidgets>
|
<customwidgets>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user