diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index df9fd66407..d9d735e391 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -72,8 +72,8 @@ class DocAnalysis(object): def __init__(self, format='html', raw=''): raw = raw.replace(' ', ' ') - raw = raw.replace('\r\n', '\n') - raw = raw.replace('\r', '\n') + #raw = raw.replace('\r\n', '\n') + #raw = raw.replace('\r', '\n') if format == 'html': linere = re.compile('(?<=]*>\s*

).*?(?=

)', re.DOTALL) elif format == 'pdf': @@ -214,10 +214,10 @@ class Dehyphenator(object): else: if self.html.find(lookupword) != -1 or searchresult != -1: - #print "returned dehyphenated word: " + str(dehyphenated) + print "returned dehyphenated word: " + str(dehyphenated) return dehyphenated else: - #print " returned hyphenated word: " + str(hyphenated) + print " returned hyphenated word: " + str(hyphenated) return hyphenated def __call__(self, html, format, length=1): @@ -228,7 +228,7 @@ class Dehyphenator(object): elif format == 'pdf': intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) elif format == 'txt': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) + intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) elif format == 'individual_words': intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet elif format == 'individual_words_txt': diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index f6adb617c3..2e35e8e345 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - convert_heuristic + convert_heuristic, normalize_line_endings from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -94,13 +94,17 @@ class TXTInput(InputFormatPlugin): else: log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + # Normalize line endings + txt = normalize_line_endings(txt) + # Get length for hyphen removal and punctuation unwrap docanalysis = DocAnalysis('txt', txt) length = docanalysis.line_length(.5) + print "length is "+str(length) # Dehyphenate dehyphenator = Dehyphenator() - html = dehyphenator(txt,'txt', length) + txt = dehyphenator(txt,'txt', length) # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. 
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index e26f0a9d07..ebdadebda2 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
           safe_mode=False)
     return HTML_TEMPLATE % (title, md.convert(txt))
 
-def separate_paragraphs_single_line(txt):
+def normalize_line_endings(txt):
     txt = txt.replace('\r\n', '\n')
     txt = txt.replace('\r', '\n')
+    return txt
+
+def separate_paragraphs_single_line(txt):
     txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
     return txt
 