mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Conversion pipeline: Add an option to control line unwrapping during preprocessing, to handle input documents with hard line breaks. See Structure Detection in the conversion options.
This commit is contained in:
commit
d4e8775e44
@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber):
|
|||||||
'chapter', 'chapter_mark',
|
'chapter', 'chapter_mark',
|
||||||
'prefer_metadata_cover', 'remove_first_image',
|
'prefer_metadata_cover', 'remove_first_image',
|
||||||
'insert_metadata', 'page_breaks_before',
|
'insert_metadata', 'page_breaks_before',
|
||||||
'preprocess_html',
|
'preprocess_html', 'html_unwrap_factor',
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
|
|
||||||
|
@ -362,6 +362,15 @@ OptionRecommendation(name='preprocess_html',
|
|||||||
)
|
)
|
||||||
),
|
),
|
||||||
|
|
||||||
|
OptionRecommendation(name='html_unwrap_factor',
|
||||||
|
recommended_value=0.40, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Scale used to determine the length at which a line should '
|
||||||
|
'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The '
|
||||||
|
'default is 0.40, just below the median line length. This will unwrap typical books '
|
||||||
|
' with hard line breaks, but should be reduced if the line length is variable.'
|
||||||
|
)
|
||||||
|
),
|
||||||
|
|
||||||
OptionRecommendation(name='smarten_punctuation',
|
OptionRecommendation(name='smarten_punctuation',
|
||||||
recommended_value=False, level=OptionRecommendation.LOW,
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
help=_('Convert plain quotes, dashes and ellipsis to their '
|
help=_('Convert plain quotes, dashes and ellipsis to their '
|
||||||
|
@ -351,7 +351,7 @@ class HTMLPreProcessor(object):
|
|||||||
# print "The pdf line length returned is " + str(length)
|
# print "The pdf line length returned is " + str(length)
|
||||||
end_rules.append(
|
end_rules.append(
|
||||||
# Unwrap using punctuation
|
# Unwrap using punctuation
|
||||||
(re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
(re.compile(r'(?<=.{%i}([a-z,:)\-IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||||
)
|
)
|
||||||
|
|
||||||
for rule in self.PREPROCESS + start_rules:
|
for rule in self.PREPROCESS + start_rules:
|
||||||
|
@ -11,10 +11,11 @@ from calibre.utils.logging import default_log
|
|||||||
|
|
||||||
class PreProcessor(object):
|
class PreProcessor(object):
|
||||||
|
|
||||||
def __init__(self, log=None):
|
def __init__(self, log=None, extra_opts=None):
|
||||||
self.log = default_log if log is None else log
|
self.log = default_log if log is None else log
|
||||||
self.html_preprocess_sections = 0
|
self.html_preprocess_sections = 0
|
||||||
self.found_indents = 0
|
self.found_indents = 0
|
||||||
|
self.extra_opts = extra_opts
|
||||||
|
|
||||||
def chapter_head(self, match):
|
def chapter_head(self, match):
|
||||||
chap = match.group('chap')
|
chap = match.group('chap')
|
||||||
@ -91,6 +92,7 @@ class PreProcessor(object):
|
|||||||
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
|
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
|
||||||
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
||||||
blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
|
blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
|
||||||
|
#multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
|
||||||
blanklines = blankreg.findall(html)
|
blanklines = blankreg.findall(html)
|
||||||
lines = linereg.findall(html)
|
lines = linereg.findall(html)
|
||||||
if len(lines) > 1:
|
if len(lines) > 1:
|
||||||
@ -147,15 +149,16 @@ class PreProcessor(object):
|
|||||||
format = 'html'
|
format = 'html'
|
||||||
|
|
||||||
# Calculate Length
|
# Calculate Length
|
||||||
length = line_length(format, html, 0.4)
|
length = line_length('pdf', html, getattr(self.extra_opts,
|
||||||
|
'html_unwrap_factor', 0.4))
|
||||||
self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
|
self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
|
||||||
#
|
#
|
||||||
# Unwrap and/or delete soft-hyphens, hyphens
|
# Unwrap and/or delete soft-hyphens, hyphens
|
||||||
html = re.sub(u'\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
|
html = re.sub(u'\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
|
||||||
html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
|
html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
|
||||||
|
|
||||||
# Unwrap lines using punctuation if the median length of all lines is less than 200
|
# Unwrap lines using punctuation and line length
|
||||||
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
||||||
html = unwrap.sub(' ', html)
|
html = unwrap.sub(' ', html)
|
||||||
|
|
||||||
# If still no sections after unwrapping mark split points on lines with no punctuation
|
# If still no sections after unwrapping mark split points on lines with no punctuation
|
||||||
|
@ -12,6 +12,7 @@ from copy import deepcopy
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
|
from calibre.ebooks.conversion.utils import PreProcessor
|
||||||
from calibre import guess_type
|
from calibre import guess_type
|
||||||
|
|
||||||
class Canvas(etree.XSLTExtension):
|
class Canvas(etree.XSLTExtension):
|
||||||
@ -419,4 +420,9 @@ class LRFInput(InputFormatPlugin):
|
|||||||
styles.write()
|
styles.write()
|
||||||
return os.path.abspath('content.opf')
|
return os.path.abspath('content.opf')
|
||||||
|
|
||||||
|
def preprocess_html(self, html):
|
||||||
|
preprocessor = PreProcessor(log=getattr(self, 'log', None))
|
||||||
|
return preprocessor(html)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -26,8 +26,10 @@ class StructureDetectionWidget(Widget, Ui_Form):
|
|||||||
'remove_first_image',
|
'remove_first_image',
|
||||||
'insert_metadata', 'page_breaks_before',
|
'insert_metadata', 'page_breaks_before',
|
||||||
'preprocess_html', 'remove_header', 'header_regex',
|
'preprocess_html', 'remove_header', 'header_regex',
|
||||||
'remove_footer', 'footer_regex']
|
'remove_footer', 'footer_regex','html_unwrap_factor']
|
||||||
)
|
)
|
||||||
|
self.opt_html_unwrap_factor.setEnabled(False)
|
||||||
|
self.huf_label.setEnabled(False)
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
for x in ('pagebreak', 'rule', 'both', 'none'):
|
for x in ('pagebreak', 'rule', 'both', 'none'):
|
||||||
self.opt_chapter_mark.addItem(x)
|
self.opt_chapter_mark.addItem(x)
|
||||||
@ -64,3 +66,8 @@ class StructureDetectionWidget(Widget, Ui_Form):
|
|||||||
_('The XPath expression %s is invalid.')%x.text).exec_()
|
_('The XPath expression %s is invalid.')%x.text).exec_()
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def set_value_handler(self, g, val):
|
||||||
|
if val is None and g is self.opt_html_unwrap_factor:
|
||||||
|
g.setValue(0.0)
|
||||||
|
return True
|
||||||
|
@ -14,10 +14,10 @@
|
|||||||
<string>Form</string>
|
<string>Form</string>
|
||||||
</property>
|
</property>
|
||||||
<layout class="QGridLayout" name="gridLayout">
|
<layout class="QGridLayout" name="gridLayout">
|
||||||
<item row="0" column="0" colspan="2">
|
<item row="0" column="1" colspan="2">
|
||||||
<widget class="XPathEdit" name="opt_chapter" native="true"/>
|
<widget class="XPathEdit" name="opt_chapter" native="true"/>
|
||||||
</item>
|
</item>
|
||||||
<item row="1" column="0">
|
<item row="1" column="0" colspan="2">
|
||||||
<widget class="QLabel" name="label">
|
<widget class="QLabel" name="label">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>Chapter &mark:</string>
|
<string>Chapter &mark:</string>
|
||||||
@ -27,31 +27,31 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="1" column="1">
|
<item row="1" column="2">
|
||||||
<widget class="QComboBox" name="opt_chapter_mark">
|
<widget class="QComboBox" name="opt_chapter_mark">
|
||||||
<property name="minimumContentsLength">
|
<property name="minimumContentsLength">
|
||||||
<number>20</number>
|
<number>20</number>
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="2" column="0">
|
<item row="2" column="0" colspan="2">
|
||||||
<widget class="QCheckBox" name="opt_remove_first_image">
|
<widget class="QCheckBox" name="opt_remove_first_image">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>Remove first &image</string>
|
<string>Remove first &image</string>
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="4" column="0">
|
<item row="5" column="0" colspan="2">
|
||||||
<widget class="QCheckBox" name="opt_insert_metadata">
|
<widget class="QCheckBox" name="opt_insert_metadata">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>Insert &metadata as page at start of book</string>
|
<string>Insert &metadata as page at start of book</string>
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="10" column="0" colspan="2">
|
<item row="11" column="0" colspan="3">
|
||||||
<widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
|
<widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
|
||||||
</item>
|
</item>
|
||||||
<item row="11" column="0" colspan="2">
|
<item row="12" column="0" colspan="3">
|
||||||
<spacer name="verticalSpacer">
|
<spacer name="verticalSpacer">
|
||||||
<property name="orientation">
|
<property name="orientation">
|
||||||
<enum>Qt::Vertical</enum>
|
<enum>Qt::Vertical</enum>
|
||||||
@ -64,27 +64,66 @@
|
|||||||
</property>
|
</property>
|
||||||
</spacer>
|
</spacer>
|
||||||
</item>
|
</item>
|
||||||
<item row="7" column="0">
|
<item row="8" column="0" colspan="2">
|
||||||
<widget class="QCheckBox" name="opt_remove_footer">
|
<widget class="QCheckBox" name="opt_remove_footer">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>Remove F&ooter</string>
|
<string>Remove F&ooter</string>
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="5" column="0">
|
<item row="6" column="0" colspan="2">
|
||||||
<widget class="QCheckBox" name="opt_remove_header">
|
<widget class="QCheckBox" name="opt_remove_header">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>Remove H&eader</string>
|
<string>Remove H&eader</string>
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="6" column="0" colspan="2">
|
<item row="7" column="0" colspan="3">
|
||||||
<widget class="RegexEdit" name="opt_header_regex" native="true"/>
|
<widget class="RegexEdit" name="opt_header_regex" native="true"/>
|
||||||
</item>
|
</item>
|
||||||
<item row="8" column="0" colspan="2">
|
<item row="9" column="0" colspan="3">
|
||||||
<widget class="RegexEdit" name="opt_footer_regex" native="true"/>
|
<widget class="RegexEdit" name="opt_footer_regex" native="true"/>
|
||||||
</item>
|
</item>
|
||||||
<item row="3" column="0">
|
<item row="4" column="1">
|
||||||
|
<widget class="QLabel" name="huf_label">
|
||||||
|
<property name="text">
|
||||||
|
<string>Line &un-wrap factor during preprocess:</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_html_unwrap_factor</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="4" column="2">
|
||||||
|
<widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
|
||||||
|
<property name="toolTip">
|
||||||
|
<string/>
|
||||||
|
</property>
|
||||||
|
<property name="maximum">
|
||||||
|
<double>1.000000000000000</double>
|
||||||
|
</property>
|
||||||
|
<property name="singleStep">
|
||||||
|
<double>0.050000000000000</double>
|
||||||
|
</property>
|
||||||
|
<property name="value">
|
||||||
|
<double>0.400000000000000</double>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="4" column="0">
|
||||||
|
<spacer name="horizontalSpacer">
|
||||||
|
<property name="orientation">
|
||||||
|
<enum>Qt::Horizontal</enum>
|
||||||
|
</property>
|
||||||
|
<property name="sizeHint" stdset="0">
|
||||||
|
<size>
|
||||||
|
<width>40</width>
|
||||||
|
<height>20</height>
|
||||||
|
</size>
|
||||||
|
</property>
|
||||||
|
</spacer>
|
||||||
|
</item>
|
||||||
|
<item row="3" column="0" colspan="2">
|
||||||
<widget class="QCheckBox" name="opt_preprocess_html">
|
<widget class="QCheckBox" name="opt_preprocess_html">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>&Preprocess input file to possibly improve structure detection</string>
|
<string>&Preprocess input file to possibly improve structure detection</string>
|
||||||
@ -108,5 +147,38 @@
|
|||||||
</customwidget>
|
</customwidget>
|
||||||
</customwidgets>
|
</customwidgets>
|
||||||
<resources/>
|
<resources/>
|
||||||
<connections/>
|
<connections>
|
||||||
|
<connection>
|
||||||
|
<sender>opt_preprocess_html</sender>
|
||||||
|
<signal>toggled(bool)</signal>
|
||||||
|
<receiver>opt_html_unwrap_factor</receiver>
|
||||||
|
<slot>setEnabled(bool)</slot>
|
||||||
|
<hints>
|
||||||
|
<hint type="sourcelabel">
|
||||||
|
<x>328</x>
|
||||||
|
<y>87</y>
|
||||||
|
</hint>
|
||||||
|
<hint type="destinationlabel">
|
||||||
|
<x>481</x>
|
||||||
|
<y>113</y>
|
||||||
|
</hint>
|
||||||
|
</hints>
|
||||||
|
</connection>
|
||||||
|
<connection>
|
||||||
|
<sender>opt_preprocess_html</sender>
|
||||||
|
<signal>toggled(bool)</signal>
|
||||||
|
<receiver>huf_label</receiver>
|
||||||
|
<slot>setEnabled(bool)</slot>
|
||||||
|
<hints>
|
||||||
|
<hint type="sourcelabel">
|
||||||
|
<x>295</x>
|
||||||
|
<y>88</y>
|
||||||
|
</hint>
|
||||||
|
<hint type="destinationlabel">
|
||||||
|
<x>291</x>
|
||||||
|
<y>105</y>
|
||||||
|
</hint>
|
||||||
|
</hints>
|
||||||
|
</connection>
|
||||||
|
</connections>
|
||||||
</ui>
|
</ui>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user