adding html_unwrap_factor

2025-12-09 22:55:02 -05:00 · 2010-09-16 16:48:59 +08:00 · 2010-09-16 16:48:59 +08:00 · 8cac314ffe
commit 8cac314ffe
parent 569b84e1cb
5 changed files with 67 additions and 20 deletions
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber):
                      'chapter', 'chapter_mark',
                      'prefer_metadata_cover', 'remove_first_image',
                      'insert_metadata', 'page_breaks_before',
-                      'preprocess_html',
+                      'preprocess_html', 'html_unwrap_factor',
                  ]
                  ),

--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -362,6 +362,15 @@ OptionRecommendation(name='preprocess_html',
            )
        ),

+OptionRecommendation(name='html_unwrap_factor',
+        recommended_value=0.40, level=OptionRecommendation.LOW,
+        help=_('Scale used to determine the length at which a line should '
+            'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The '
+            'default is 0.40, just below the median line length.  This will unwrap typical books '
+            ' with hard line breaks, but should be reduced if the line length is variable.'
+            )
+        ),
+
 OptionRecommendation(name='smarten_punctuation',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Convert plain quotes, dashes and ellipsis to their '
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -11,10 +11,11 @@ from calibre.utils.logging import default_log

 class PreProcessor(object):

-    def __init__(self, log=None):
+    def __init__(self, log=None, extra_opts=None):
        self.log = default_log if log is None else log
        self.html_preprocess_sections = 0
        self.found_indents = 0
+        self.extra_opts = extra_opts

    def chapter_head(self, match):
        chap = match.group('chap')
@ -91,6 +92,7 @@ class PreProcessor(object):
        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
+        multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
        blanklines = blankreg.findall(html)
        lines = linereg.findall(html)
        if len(lines) > 1:
@ -147,15 +149,19 @@ class PreProcessor(object):
            format = 'html'

        # Calculate Length        
-        length = line_length(format, html, 0.4)
+        #if getattr(self.extra_opts, 'html_unwrap_factor', 0.0) > 0.01:
+        length = line_length('pdf', html, getattr(self.extra_opts, 'html_unwrap_factor'))
+        #else:
+        #    length = line_length(format, html, 0.4)
+        #    self.log("#@#%!$@#$ - didn't find unwrap_factor")
        self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
        #
        # Unwrap and/or delete soft-hyphens, hyphens
        html = re.sub(u'\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
        html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)

-        # Unwrap lines using punctation if the median length of all lines is less than 200
-        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+        # Unwrap lines using punctuation and line length
+        unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
        html = unwrap.sub(' ', html)

        # If still no sections after unwrapping mark split points on lines with no punctuation
--- a/src/calibre/gui2/convert/structure_detection.py
+++ b/src/calibre/gui2/convert/structure_detection.py
@ -26,7 +26,7 @@ class StructureDetectionWidget(Widget, Ui_Form):
                'remove_first_image',
                'insert_metadata', 'page_breaks_before',
                'preprocess_html', 'remove_header', 'header_regex',
-                'remove_footer', 'footer_regex']
+                'remove_footer', 'footer_regex','html_unwrap_factor']
                )
        self.db, self.book_id = db, book_id
        for x in ('pagebreak', 'rule', 'both', 'none'):
@ -64,3 +64,8 @@ class StructureDetectionWidget(Widget, Ui_Form):
                _('The XPath expression %s is invalid.')%x.text).exec_()
                return False
        return True
+
+    def set_value_handler(self, g, val):
+        if val is None and isinstance(g, QDoubleSpinBox):
+            g.setValue(0.0)
+            return True
--- a/src/calibre/gui2/convert/structure_detection.ui
+++ b/src/calibre/gui2/convert/structure_detection.ui
@ -48,17 +48,7 @@
     </property>
    </widget>
   </item>
-   <item row="8" column="0" colspan="2">
-    <widget class="QCheckBox" name="opt_preprocess_html">
-     <property name="text">
-      <string>&amp;Preprocess input file to possibly improve structure detection</string>
-     </property>
-    </widget>
-   </item>
-   <item row="9" column="0" colspan="2">
-    <widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
-   </item>
-   <item row="10" column="0" colspan="2">
+   <item row="14" column="0" colspan="2">
    <spacer name="verticalSpacer">
     <property name="orientation">
      <enum>Qt::Vertical</enum>
@ -88,8 +78,45 @@
   <item row="5" column="0" colspan="2">
    <widget class="RegexEdit" name="opt_header_regex" native="true"/>
   </item>
-   <item row="7" column="0" colspan="2">
-    <widget class="RegexEdit" name="opt_footer_regex" native="true"/>
+   <item row="8" column="0" colspan="2">
+    <widget class="RegexEdit" name="opt_footer_regex" native="true">
+     <zorder>opt_page_breaks_before</zorder>
+    </widget>
+   </item>
+   <item row="10" column="0" colspan="2">
+    <widget class="XPathEdit" name="opt_page_breaks_before" native="true">
+     <zorder>opt_footer_regex</zorder>
+    </widget>
+   </item>
+   <item row="11" column="0">
+    <widget class="QCheckBox" name="opt_preprocess_html">
+     <property name="text">
+      <string>&amp;Preprocess input file to possibly improve structure detection</string>
+     </property>
+    </widget>
+   </item>
+   <item row="12" column="0">
+    <widget class="QLabel" name="label_2">
+     <property name="layoutDirection">
+      <enum>Qt::RightToLeft</enum>
+     </property>
+     <property name="text">
+      <string>Line Un-Wrapping Factor</string>
+     </property>
+    </widget>
+   </item>
+   <item row="12" column="1">
+    <widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
+     <property name="maximum">
+      <double>1.000000000000000</double>
+     </property>
+     <property name="singleStep">
+      <double>0.050000000000000</double>
+     </property>
+     <property name="value">
+      <double>0.400000000000000</double>
+     </property>
+    </widget>
   </item>
  </layout>
 </widget>