added dehyphenation to txt input

2025-08-30 23:00:21 -04:00 · 2011-01-09 17:27:24 +08:00 · 2011-01-09 17:27:24 +08:00 · f3a9f3f83f
commit f3a9f3f83f
parent 289cdf3392
2 changed files with 23 additions and 10 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -72,6 +72,8 @@ class DocAnalysis(object):

    def __init__(self, format='html', raw=''):
        raw = raw.replace('&nbsp;', ' ')
+        raw = raw.replace('\r\n', '\n')
+        raw = raw.replace('\r', '\n')
        if format == 'html':
            linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
        elif format == 'pdf':
@ -79,7 +81,7 @@ class DocAnalysis(object):
        elif format == 'spanned_html':
            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
        elif format == 'txt':
-            linere = re.compile('.*?\n', re.DOTALL)
+            linere = re.compile('.*?\n')
        self.lines = linere.findall(raw)

    def line_length(self, percent):
@ -177,7 +179,7 @@ class Dehyphenator(object):
    def __init__(self):
        # Add common suffixes to the regex below to increase the likelihood of a match -
        # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
        # remove prefixes if the prefix was not already the point of hyphenation
        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
@ -194,7 +196,7 @@ class Dehyphenator(object):
        lookupword = self.removesuffixes.sub('', dehyphenated)
        if self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
-        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
        try:
            searchresult = self.html.find(lookupword.lower())
        except:
@ -225,8 +227,13 @@ class Dehyphenator(object):
            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
        elif format == 'pdf':
            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+        elif format == 'txt':
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
        elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
+        elif format == 'individual_words_txt':
+            intextmatch = re.compile(u'\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b')
+
        elif format == 'html_cleanup':
            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')

--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en'
 import os

 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
    separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
@ -93,6 +94,14 @@ class TXTInput(InputFormatPlugin):
                else:
                    log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

+            # Get length for hyphen removal and punctuation unwrap
+            docanalysis = DocAnalysis('txt', txt)
+            length = docanalysis.line_length(.5)
+
+            # Dehyphenate
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(txt,'txt', length)
+
            # We don't check for block because the processor assumes block.
            # single and print are transformed to block for processing.

@ -103,10 +112,8 @@ class TXTInput(InputFormatPlugin):

            if options.paragraph_type == 'unformatted':
                from calibre.ebooks.conversion.utils import PreProcessor
-                from calibre.ebooks.conversion.preprocess import DocAnalysis
                # get length
-                docanalysis = DocAnalysis('txt', txt)
-                length = docanalysis.line_length(.5)
+
                # unwrap lines based on punctuation
                preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
                txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
@ -118,7 +125,6 @@ class TXTInput(InputFormatPlugin):
            else:
                html = convert_basic(txt, epub_split_size_kb=flow_size)

-
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options: