Added partial dehyphenation for Markdown

2025-07-09 03:04:10 -04:00 · 2011-01-09 19:34:02 +08:00 · 2011-01-09 19:34:02 +08:00 · c2cef786ce
commit c2cef786ce
parent 0f109d699f
2 changed files with 22 additions and 16 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -201,15 +201,15 @@ class Dehyphenator(object):
            searchresult = self.html.find(lookupword.lower())
        except:
            return hyphenated
-        if self.format == 'html_cleanup':
+        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
            if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
                return dehyphenated
            elif self.html.find(hyphenated) != -1:
-                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                print "Cleanup:returned hyphenated word: " + str(hyphenated)
                return hyphenated
            else:
-                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
                return firsthalf+u'\u2014'+wraptags+secondhalf
        else:
@@ -230,12 +230,12 @@ class Dehyphenator(object):
        elif format == 'txt':
            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
        elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
        elif format == 'individual_words_txt':
            intextmatch = re.compile(u'\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b')
        elif format == 'html_cleanup':
            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
        elif format == 'txt_cleanup':
            intextmatch = re.compile(u'(?P<firstpart>\w+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
        html = intextmatch.sub(self.dehyphenate, html)
        return html
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -74,6 +74,14 @@ class TXTInput(InputFormatPlugin):
        if options.preserve_spaces:
            txt = preserve_spaces(txt)
        # Normalize line endings
        txt = normalize_line_endings(txt)
        # Get length for hyphen removal and punctuation unwrap
        docanalysis = DocAnalysis('txt', txt)
        length = docanalysis.line_length(.5)
        print "length is "+str(length)
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
@@ -94,14 +102,6 @@ class TXTInput(InputFormatPlugin):
                else:
                    log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
            # Normalize line endings
            txt = normalize_line_endings(txt)
            # Get length for hyphen removal and punctuation unwrap
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            print "length is "+str(length)
            # Dehyphenate
            dehyphenator = Dehyphenator()
            txt = dehyphenator(txt,'txt', length)
@@ -129,6 +129,12 @@ class TXTInput(InputFormatPlugin):
            else:
                html = convert_basic(txt, epub_split_size_kb=flow_size)
        # Dehyphenate in cleanup mode for missed txt and markdown conversion
        print "going through final dehyphenation"
        dehyphenator = Dehyphenator()
        html = dehyphenator(html,'txt_cleanup', length)
        html = dehyphenator(html,'html_cleanup', length)
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options: