...

2026-01-03 10:40:21 -05:00 · 2011-01-09 10:15:53 -07:00 · 2011-01-09 10:15:53 -07:00 · 26e8ec2fd0
commit 26e8ec2fd0
parent 60d5f0f04d 7008e9b64c
4 changed files with 53 additions and 24 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -79,7 +79,7 @@ class DocAnalysis(object):
        elif format == 'spanned_html':
            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
        elif format == 'txt':
-            linere = re.compile('.*?\n', re.DOTALL)
+            linere = re.compile('.*?\n')
        self.lines = linere.findall(raw)

    def line_length(self, percent):
@@ -177,7 +177,7 @@ class Dehyphenator(object):
    def __init__(self):
        # Add common suffixes to the regex below to increase the likelihood of a match -
        # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
        # remove prefixes if the prefix was not already the point of hyphenation
        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
@@ -199,7 +199,7 @@ class Dehyphenator(object):
            searchresult = self.html.find(lookupword.lower())
        except:
            return hyphenated
-        if self.format == 'html_cleanup':
+        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
            if self.html.find(lookupword) != -1 or searchresult != -1:
                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
                return dehyphenated
@@ -225,10 +225,15 @@ class Dehyphenator(object):
            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
        elif format == 'pdf':
            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+        elif format == 'txt':
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
        elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
        elif format == 'html_cleanup':
            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+        elif format == 'txt_cleanup':
+            intextmatch = re.compile(u'(?P<firstpart>\w+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
+

        html = intextmatch.sub(self.dehyphenate, html)
        return html
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -190,7 +190,7 @@ class PreProcessor(object):
        line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
        blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
        line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
-        txt_line_wrap = u"(\u0020|\u0009)*\n"
+        txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
        
        unwrap_regex = lookahead+line_ending+blanklines+line_opening
        if format == 'txt':
@@ -357,6 +357,6 @@ class PreProcessor(object):
        html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)

        # Center separator lines
-        html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
+        html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)

        return html
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -7,11 +7,12 @@ __docformat__ = 'restructuredtext en'
 import os

 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
    separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
    preserve_spaces, detect_paragraph_type, detect_formatting_type, \
-    convert_heuristic
+    convert_heuristic, normalize_line_endings
 from calibre import _ent_pat, xml_entity_to_unicode

 class TXTInput(InputFormatPlugin):
@@ -23,7 +24,7 @@ class TXTInput(InputFormatPlugin):

    options = set([
        OptionRecommendation(name='paragraph_type', recommended_value='auto',
-            choices=['auto', 'block', 'single', 'print'],
+            choices=['auto', 'block', 'single', 'print', 'unformatted'],
            help=_('Paragraph structure.\n'
                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
                   '* auto: Try to auto detect paragraph type.\n'
@@ -31,7 +32,7 @@ class TXTInput(InputFormatPlugin):
                   '* single: Assume every line is a paragraph.\n'
                   '* print:  Assume every line starting with 2+ spaces or a tab '
                   'starts a paragraph.'
-                   '* unformatted: Most lines have hard line breaks, few/no spaces or indents.')),
+                   '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')),
        OptionRecommendation(name='formatting_type', recommended_value='auto',
            choices=['auto', 'none', 'heuristic', 'markdown'],
            help=_('Formatting used within the document.'
@@ -72,6 +73,13 @@ class TXTInput(InputFormatPlugin):
        # followed by the &nbsp; entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)
+
+        # Normalize line endings
+        txt = normalize_line_endings(txt)
+
+        # Get length for hyphen removal and punctuation unwrap
+        docanalysis = DocAnalysis('txt', txt)
+        length = docanalysis.line_length(.5)
            
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
@@ -91,10 +99,15 @@ class TXTInput(InputFormatPlugin):
                    log.debug('Could not reliably determine paragraph type using block')
                    options.paragraph_type = 'block'
                else:
-                    log.debug('Auto detected paragraph type as %s' % options.paragraph_type) 
-            
+                    log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
+
+            # Dehyphenate
+            dehyphenator = Dehyphenator()
+            txt = dehyphenator(txt,'txt', length)
+
            # We don't check for block because the processor assumes block.
            # single and print at transformed to block for processing.
+
            if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted':
                txt = separate_paragraphs_single_line(txt)
            elif options.paragraph_type == 'print':
@@ -102,10 +115,8 @@ class TXTInput(InputFormatPlugin):

            if options.paragraph_type == 'unformatted':
                from calibre.ebooks.conversion.utils import PreProcessor
-                from calibre.ebooks.conversion.preprocess import DocAnalysis
                # get length
-                docanalysis = DocAnalysis('txt', txt)
-                length = docanalysis.line_length(.5)
+
                # unwrap lines based on punctuation
                preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
                txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
@@ -116,7 +127,11 @@ class TXTInput(InputFormatPlugin):
                html = convert_heuristic(txt, epub_split_size_kb=flow_size)
            else:
                html = convert_basic(txt, epub_split_size_kb=flow_size)
-            
+
+        # Dehyphenate in cleanup mode for missed txt and markdown conversion
+        dehyphenator = Dehyphenator()
+        html = dehyphenator(html,'txt_cleanup', length)
+        html = dehyphenator(html,'html_cleanup', length)

        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
          safe_mode=False)
    return HTML_TEMPLATE % (title, md.convert(txt))

-def separate_paragraphs_single_line(txt):
+def normalize_line_endings(txt):
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
+    return txt
+
+def separate_paragraphs_single_line(txt):
    txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
    return txt

@@ -117,7 +120,7 @@ def detect_paragraph_type(txt):
    single: Each line is a paragraph.
    print: Each paragraph starts with a 2+ spaces or a tab
           and ends when a new paragraph is reached.
-    unformatted: most lines have hard line breaks, few/no spaces or indents
+    unformatted: most lines have hard line breaks, few/no blank lines or indents
    
    returns block, single, print, unformatted
    '''
@@ -130,15 +133,21 @@ def detect_paragraph_type(txt):
    hardbreaks = docanalysis.line_histogram(.55)
    
    if hardbreaks:
-        # Check for print
+        # Determine print percentage
        tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
-        if tab_line_count / float(txt_line_count) >= .15:
-            return 'print'
-        
-        # Check for block
+        print_percent = tab_line_count / float(txt_line_count)
+     
+        # Determine block percentage
        empty_line_count = len(re.findall('(?mu)^\s*$', txt))
-        if empty_line_count / float(txt_line_count) >= .15:
-            return 'block'
+        block_percent = empty_line_count / float(txt_line_count)
+        
+        # Compare the two types - the type with the larger number of instances wins
+        # in cases where only one or the other represents the vast majority of the document neither wins
+        if print_percent >= block_percent:
+            if .15 <= print_percent <= .75:
+                return 'print'
+        elif .15 <= block_percent <= .75:
+            return 'block'     

        # Assume unformatted text with hardbreaks if nothing else matches        
        return 'unformatted'