diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 2ef633d0bb..62a941142b 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber): 'chapter', 'chapter_mark', 'prefer_metadata_cover', 'remove_first_image', 'insert_metadata', 'page_breaks_before', - 'preprocess_html', + 'preprocess_html', 'html_unwrap_factor', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 16282dd28d..3ea2926461 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -362,6 +362,15 @@ OptionRecommendation(name='preprocess_html', ) ), +OptionRecommendation(name='html_unwrap_factor', + recommended_value=0.40, level=OptionRecommendation.LOW, + help=_('Scale used to determine the length at which a line should ' + 'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The ' + 'default is 0.40, just below the median line length. This will unwrap typical books ' + ' with hard line breaks, but should be reduced if the line length is variable.' + ) + ), + OptionRecommendation(name='smarten_punctuation', recommended_value=False, level=OptionRecommendation.LOW, help=_('Convert plain quotes, dashes and ellipsis to their ' diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 4538af96c4..e72e15c3d9 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -351,7 +351,7 @@ class HTMLPreProcessor(object): # print "The pdf line length returned is " + str(length) end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}([a-z,:)\-IA]|(?)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 5301f70a16..3fe6ce0ed4 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -11,10 +11,11 @@ from calibre.utils.logging import default_log class PreProcessor(object): - def __init__(self, log=None): + def __init__(self, log=None, extra_opts=None): self.log = default_log if log is None else log self.html_preprocess_sections = 0 self.found_indents = 0 + self.extra_opts = extra_opts def chapter_head(self, match): chap = match.group('chap') @@ -91,6 +92,7 @@ class PreProcessor(object): # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) blankreg = re.compile(r'\s*]*>\s*(<(b|i|u)>)?\s*()?\s*

', re.IGNORECASE) + #multi_blank = re.compile(r'(\s*]*>\s*(<(b|i|u)>)?\s*()?\s*

){2,}', re.IGNORECASE) blanklines = blankreg.findall(html) lines = linereg.findall(html) if len(lines) > 1: @@ -147,15 +149,16 @@ class PreProcessor(object): format = 'html' # Calculate Length - length = line_length(format, html, 0.4) + length = line_length('pdf', html, getattr(self.extra_opts, + 'html_unwrap_factor', 0.4)) self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***") # # Unwrap and/or delete soft-hyphens, hyphens html = re.sub(u'­\s*(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*', '', html) html = re.sub(u'(?<=[-–—])\s*(?=<)(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) - # Unwrap lines using punctation if the median length of all lines is less than 200 - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + # Unwrap lines using punctation and line length + unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) html = unwrap.sub(' ', html) # If still no sections after unwrapping mark split points on lines with no punctuation diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py index 1d730ab573..c54f3b071f 100644 --- a/src/calibre/ebooks/lrf/input.py +++ b/src/calibre/ebooks/lrf/input.py @@ -12,6 +12,7 @@ from copy import deepcopy from lxml import etree from calibre.customize.conversion import InputFormatPlugin +from calibre.ebooks.conversion.utils import PreProcessor from calibre import guess_type class Canvas(etree.XSLTExtension): @@ -419,4 +420,9 @@ class LRFInput(InputFormatPlugin): styles.write() return os.path.abspath('content.opf') + def preprocess_html(self, html): + preprocessor = PreProcessor(log=getattr(self, 'log', None)) + return preprocessor(html) + + diff --git a/src/calibre/gui2/convert/structure_detection.py b/src/calibre/gui2/convert/structure_detection.py index f2ca49d1bd..3f350d4508 100644 --- a/src/calibre/gui2/convert/structure_detection.py +++ b/src/calibre/gui2/convert/structure_detection.py @@ -26,8 +26,10 @@ class StructureDetectionWidget(Widget, Ui_Form): 'remove_first_image', 'insert_metadata', 'page_breaks_before', 'preprocess_html', 'remove_header', 'header_regex', - 'remove_footer', 'footer_regex'] + 'remove_footer', 'footer_regex','html_unwrap_factor'] ) + self.opt_html_unwrap_factor.setEnabled(False) + self.huf_label.setEnabled(False) self.db, self.book_id = db, book_id for x in ('pagebreak', 'rule', 'both', 'none'): self.opt_chapter_mark.addItem(x) @@ -64,3 +66,8 @@ class StructureDetectionWidget(Widget, Ui_Form): _('The XPath expression %s is invalid.')%x.text).exec_() return False return True + + def set_value_handler(self, g, val): + if val is None and g is self.opt_html_unwrap_factor: + g.setValue(0.0) + return True diff --git a/src/calibre/gui2/convert/structure_detection.ui b/src/calibre/gui2/convert/structure_detection.ui index c0b3de3bd9..21fe365e99 100644 --- a/src/calibre/gui2/convert/structure_detection.ui +++ b/src/calibre/gui2/convert/structure_detection.ui @@ -14,10 +14,10 @@ Form - + - + Chapter &mark: @@ -27,31 +27,31 @@ - + 20 - + Remove first &image - + Insert &metadata as page at start of book - + - + Qt::Vertical @@ -64,27 +64,66 @@ - + Remove F&ooter - + Remove H&eader - + - + - + + + + Line &un-wrap factor during preprocess: + + + opt_html_unwrap_factor + + + + + + + + + + 1.000000000000000 + + + 0.050000000000000 + + + 0.400000000000000 + + + + + + + Qt::Horizontal + + + + 40 + 20 + + + + + &Preprocess input file to possibly improve structure detection @@ -108,5 +147,38 @@ - + + + opt_preprocess_html + toggled(bool) + opt_html_unwrap_factor + setEnabled(bool) + + + 328 + 87 + + + 481 + 113 + + + + + opt_preprocess_html + toggled(bool) + huf_label + setEnabled(bool) + + + 295 + 88 + + + 291 + 105 + + + +