diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index ae111355e4..08a46cb8d9 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -79,7 +79,7 @@ class DocAnalysis(object): elif format == 'spanned_html': linere = re.compile('(?<=)', re.DOTALL) elif format == 'txt': - linere = re.compile('.*?\n', re.DOTALL) + linere = re.compile('.*?\n') self.lines = linere.findall(raw) def line_length(self, percent): @@ -177,7 +177,7 @@ class Dehyphenator(object): def __init__(self): # Add common suffixes to the regex below to increase the likelihood of a match - # don't add suffixes which are also complete words, such as 'able' or 'sex' - self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE) + self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE) # remove prefixes if the prefix was not already the point of hyphenation self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE) self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE) @@ -199,7 +199,7 @@ class Dehyphenator(object): searchresult = self.html.find(lookupword.lower()) except: return hyphenated - if self.format == 'html_cleanup': + if self.format == 'html_cleanup' or self.format == 'txt_cleanup': if self.html.find(lookupword) != -1 or searchresult != -1: #print "Cleanup:returned dehyphenated word: " + str(dehyphenated) return dehyphenated @@ -225,10 +225,15 @@ class Dehyphenator(object): intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P()?\s*(\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*

\s*)?\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length) elif format == 'pdf': intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) + elif format == 'txt': + intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) elif format == 'individual_words': - intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b[^<]*<') # for later, not called anywhere yet elif format == 'html_cleanup': intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') + elif format == 'txt_cleanup': + intextmatch = re.compile(u'(?P\w+)(-|‐)(?P\s+)(?P[\w\d]+)') + html = intextmatch.sub(self.dehyphenate, html) return html diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 27dacdf5fb..52d1bcc619 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -190,7 +190,7 @@ class PreProcessor(object): line_ending = "\s*\s*()?" blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*" line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" - txt_line_wrap = u"(\u0020|\u0009)*\n" + txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}" unwrap_regex = lookahead+line_ending+blanklines+line_opening if format == 'txt': @@ -357,6 +357,6 @@ class PreProcessor(object): html = blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) # Center separator lines - html = re.sub(u'

\s*(?P([*#•]+\s*)+)\s*

', '

' + '\g' + '

', html) + html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) return html diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index e782cd0cd9..3957391494 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -7,11 +7,12 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - convert_heuristic + convert_heuristic, normalize_line_endings from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -23,7 +24,7 @@ class TXTInput(InputFormatPlugin): options = set([ OptionRecommendation(name='paragraph_type', recommended_value='auto', - choices=['auto', 'block', 'single', 'print'], + choices=['auto', 'block', 'single', 'print', 'unformatted'], help=_('Paragraph structure.\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n' '* auto: Try to auto detect paragraph type.\n' @@ -31,7 +32,7 @@ class TXTInput(InputFormatPlugin): '* single: Assume every line is a paragraph.\n' '* print: Assume every line starting with 2+ spaces or a tab ' 'starts a paragraph.' - '* unformatted: Most lines have hard line breaks, few/no spaces or indents.')), + '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')), OptionRecommendation(name='formatting_type', recommended_value='auto', choices=['auto', 'none', 'heuristic', 'markdown'], help=_('Formatting used within the document.' @@ -72,6 +73,13 @@ class TXTInput(InputFormatPlugin): # followed by the   entity. if options.preserve_spaces: txt = preserve_spaces(txt) + + # Normalize line endings + txt = normalize_line_endings(txt) + + # Get length for hyphen removal and punctuation unwrap + docanalysis = DocAnalysis('txt', txt) + length = docanalysis.line_length(.5) if options.formatting_type == 'auto': options.formatting_type = detect_formatting_type(txt) @@ -91,10 +99,15 @@ class TXTInput(InputFormatPlugin): log.debug('Could not reliably determine paragraph type using block') options.paragraph_type = 'block' else: - log.debug('Auto detected paragraph type as %s' % options.paragraph_type) - + log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + + # Dehyphenate + dehyphenator = Dehyphenator() + txt = dehyphenator(txt,'txt', length) + # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. + if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted': txt = separate_paragraphs_single_line(txt) elif options.paragraph_type == 'print': @@ -102,10 +115,8 @@ class TXTInput(InputFormatPlugin): if options.paragraph_type == 'unformatted': from calibre.ebooks.conversion.utils import PreProcessor - from calibre.ebooks.conversion.preprocess import DocAnalysis # get length - docanalysis = DocAnalysis('txt', txt) - length = docanalysis.line_length(.5) + # unwrap lines based on punctuation preprocessor = PreProcessor(options, log=getattr(self, 'log', None)) txt = preprocessor.punctuation_unwrap(length, txt, 'txt') @@ -116,7 +127,11 @@ class TXTInput(InputFormatPlugin): html = convert_heuristic(txt, epub_split_size_kb=flow_size) else: html = convert_basic(txt, epub_split_size_kb=flow_size) - + + # Dehyphenate in cleanup mode for missed txt and markdown conversion + dehyphenator = Dehyphenator() + html = dehyphenator(html,'txt_cleanup', length) + html = dehyphenator(html,'html_cleanup', length) from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 9dc29e45dd..6a1a106681 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False): safe_mode=False) return HTML_TEMPLATE % (title, md.convert(txt)) -def separate_paragraphs_single_line(txt): +def normalize_line_endings(txt): txt = txt.replace('\r\n', '\n') txt = txt.replace('\r', '\n') + return txt + +def separate_paragraphs_single_line(txt): txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt) return txt @@ -117,7 +120,7 @@ def detect_paragraph_type(txt): single: Each line is a paragraph. print: Each paragraph starts with a 2+ spaces or a tab and ends when a new paragraph is reached. - unformatted: most lines have hard line breaks, few/no spaces or indents + unformatted: most lines have hard line breaks, few/no blank lines or indents returns block, single, print, unformatted ''' @@ -130,15 +133,21 @@ def detect_paragraph_type(txt): hardbreaks = docanalysis.line_histogram(.55) if hardbreaks: - # Check for print + # Determine print percentage tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) - if tab_line_count / float(txt_line_count) >= .15: - return 'print' - - # Check for block + print_percent = tab_line_count / float(txt_line_count) + + # Determine block percentage empty_line_count = len(re.findall('(?mu)^\s*$', txt)) - if empty_line_count / float(txt_line_count) >= .15: - return 'block' + block_percent = empty_line_count / float(txt_line_count) + + # Compare the two types - the type with the larger number of instances wins + # in cases where only one or the other represents the vast majority of the document neither wins + if print_percent >= block_percent: + if .15 <= print_percent <= .75: + return 'print' + elif .15 <= block_percent <= .75: + return 'block' # Assume unformatted text with hardbreaks if nothing else matches return 'unformatted'