|[iub]>\s*
\s*<[iub]>)\s*(?P[\w\d]+)'% length)
elif format == 'txt':
- intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
+ intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
elif format == 'individual_words':
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
elif format == 'individual_words_txt':
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index f6adb617c3..2e35e8e345 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
- convert_heuristic
+ convert_heuristic, normalize_line_endings
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@@ -94,13 +94,17 @@ class TXTInput(InputFormatPlugin):
else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
+ # Normalize line endings
+ txt = normalize_line_endings(txt)
+
# Get length for hyphen removal and punctuation unwrap
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
+ print "length is "+str(length)
# Dehyphenate
dehyphenator = Dehyphenator()
- html = dehyphenator(txt,'txt', length)
+ txt = dehyphenator(txt,'txt', length)
# We don't check for block because the processor assumes block.
# single and print at transformed to block for processing.
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index e26f0a9d07..ebdadebda2 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
safe_mode=False)
return HTML_TEMPLATE % (title, md.convert(txt))
-def separate_paragraphs_single_line(txt):
+def normalize_line_endings(txt):
txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n')
+ return txt
+
+def separate_paragraphs_single_line(txt):
txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
return txt