Normalize line endings to simplify line-length detection and dehyphenation; this also fixes print-formatted output for certain line endings

2026-05-27 17:22:34 -04:00 · 2011-01-09 18:14:49 +08:00
parent f3a9f3f83f
commit 696d925232
3 changed files with 15 additions and 8 deletions
@@ -72,8 +72,8 @@ class DocAnalysis(object):

    def __init__(self, format='html', raw=''):
        raw = raw.replace('&nbsp;', ' ')
-        raw = raw.replace('\r\n', '\n')
-        raw = raw.replace('\r', '\n')
+        #raw = raw.replace('\r\n', '\n')
+        #raw = raw.replace('\r', '\n')
        if format == 'html':
            linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
        elif format == 'pdf':
@@ -214,10 +214,10 @@ class Dehyphenator(object):

        else:
            if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "returned dehyphenated word: " + str(dehyphenated)
+                print "returned dehyphenated word: " + str(dehyphenated)
                return dehyphenated
            else:
-                #print "           returned hyphenated word: " + str(hyphenated)
+                print "           returned hyphenated word: " + str(hyphenated)
                return hyphenated

    def __call__(self, html, format, length=1):
@@ -228,7 +228,7 @@ class Dehyphenator(object):
        elif format == 'pdf':
            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
        elif format == 'txt':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
        elif format == 'individual_words':
            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
        elif format == 'individual_words_txt':
@@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
    separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
    preserve_spaces, detect_paragraph_type, detect_formatting_type, \
-    convert_heuristic
+    convert_heuristic, normalize_line_endings
 from calibre import _ent_pat, xml_entity_to_unicode

 class TXTInput(InputFormatPlugin):
@@ -94,13 +94,17 @@ class TXTInput(InputFormatPlugin):
                else:
                    log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

+            # Normalize line endings
+            txt = normalize_line_endings(txt)
+
            # Get length for hyphen removal and punctuation unwrap
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
+            print "length is "+str(length)

            # Dehyphenate
            dehyphenator = Dehyphenator()
-            html = dehyphenator(txt,'txt', length)
+            txt = dehyphenator(txt,'txt', length)

            # We don't check for block because the processor assumes block.
            # single and print are transformed to block for processing.
@@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
          safe_mode=False)
    return HTML_TEMPLATE % (title, md.convert(txt))

-def separate_paragraphs_single_line(txt):
+def normalize_line_endings(txt):
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
+    return txt
+
+def separate_paragraphs_single_line(txt):
    txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
    return txt