diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index ae111355e4..df9fd66407 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -72,6 +72,8 @@ class DocAnalysis(object): def __init__(self, format='html', raw=''): raw = raw.replace(' ', ' ') + raw = raw.replace('\r\n', '\n') + raw = raw.replace('\r', '\n') if format == 'html': linere = re.compile('(?<=]*>\s*

).*?(?=

)', re.DOTALL) elif format == 'pdf': @@ -79,7 +81,7 @@ class DocAnalysis(object): elif format == 'spanned_html': linere = re.compile('(?<=)', re.DOTALL) elif format == 'txt': - linere = re.compile('.*?\n', re.DOTALL) + linere = re.compile('.*?\n') self.lines = linere.findall(raw) def line_length(self, percent): @@ -177,7 +179,7 @@ class Dehyphenator(object): def __init__(self): # Add common suffixes to the regex below to increase the likelihood of a match - # don't add suffixes which are also complete words, such as 'able' or 'sex' - self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE) + self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE) # remove prefixes if the prefix was not already the point of hyphenation self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE) self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE) @@ -194,7 +196,7 @@ class Dehyphenator(object): lookupword = self.removesuffixes.sub('', dehyphenated) if self.prefixes.match(firsthalf) is None: lookupword = self.removeprefix.sub('', lookupword) - #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) + #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) try: searchresult = self.html.find(lookupword.lower()) except: @@ -225,8 +227,13 @@ class Dehyphenator(object): intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P()?\s*(\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*

\s*)?\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length) elif format == 'pdf': intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) + elif format == 'txt': + intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) elif format == 'individual_words': - intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet + elif format == 'individual_words_txt': + intextmatch = re.compile(u'\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b') + elif format == 'html_cleanup': intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 9bc9323a4c..f6adb617c3 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ @@ -91,8 +92,16 @@ class TXTInput(InputFormatPlugin): log.debug('Could not reliably determine paragraph type using block') options.paragraph_type = 'block' else: - log.debug('Auto detected paragraph type as %s' % options.paragraph_type) - + log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + + # Get length for hyphen removal and punctuation unwrap + docanalysis = DocAnalysis('txt', txt) + length = docanalysis.line_length(.5) + + # Dehyphenate; keep the result in txt so the paragraph processing and conversion below operate on the dehyphenated text + dehyphenator = Dehyphenator() + txt = dehyphenator(txt,'txt', length) + # We don't check for block because the processor assumes block. 
# single and print at transformed to block for processing. @@ -103,10 +112,8 @@ class TXTInput(InputFormatPlugin): if options.paragraph_type == 'unformatted': from calibre.ebooks.conversion.utils import PreProcessor - from calibre.ebooks.conversion.preprocess import DocAnalysis # get length - docanalysis = DocAnalysis('txt', txt) - length = docanalysis.line_length(.5) + # unwrap lines based on punctuation preprocessor = PreProcessor(options, log=getattr(self, 'log', None)) txt = preprocessor.punctuation_unwrap(length, txt, 'txt') @@ -117,7 +124,6 @@ class TXTInput(InputFormatPlugin): html = convert_heuristic(txt, epub_split_size_kb=flow_size) else: html = convert_basic(txt, epub_split_size_kb=flow_size) - from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html')