diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 5db920b01d..27dacdf5fb 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -184,7 +184,22 @@ class PreProcessor(object): self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters") return html - + def punctuation_unwrap(self, length, content, format): + # define the pieces of the regex + lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?\s*()?" + blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*" + line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" + txt_line_wrap = u"(\u0020|\u0009)*\n" + + unwrap_regex = lookahead+line_ending+blanklines+line_opening + if format == 'txt': + unwrap_regex = lookahead+txt_line_wrap + + unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE) + content = unwrap.sub(' ', content) + return content + def __call__(self, html): self.log("********* Preprocessing HTML *********") @@ -312,8 +327,7 @@ class PreProcessor(object): self.log("Done dehyphenating") # Unwrap lines using punctation and line length #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE) - unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - html = unwrap.sub(' ', html) + html = self.punctuation_unwrap(length, html, 'html') #check any remaining hyphens, but only unwrap if there is a match dehyphenator = Dehyphenator() html = dehyphenator(html,'html_cleanup', length) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 7fb22755de..98756c5fa1 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -95,6 +95,16 @@ class TXTInput(InputFormatPlugin): elif options.paragraph_type == 'print': txt = separate_paragraphs_print_formatted(txt) + if options.paragraph_type == 'unformatted': + from calibre.ebooks.conversion.utils import PreProcessor + from calibre.ebooks.conversion.preprocess import DocAnalysis + # get length + docanalysis = DocAnalysis('txt', txt) + length = docanalysis.line_length(.5) + # unwrap lines based on punctuation + preprocessor = PreProcessor(options, log=getattr(self, 'log', None)) + txt = preprocessor.punctuation_unwrap(length, txt, 'txt') + flow_size = getattr(options, 'flow_size', 0) html = convert_basic(txt, epub_split_size_kb=flow_size)