From da679b885faf07c3218946d072b529259c5e6955 Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 7 Jan 2011 11:26:45 +0800 Subject: [PATCH 1/5] chapter heading tweaks --- src/calibre/ebooks/conversion/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 4bb96ac088..2090cff12d 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -153,7 +153,7 @@ class PreProcessor(object): default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(]*>)?(?=<)" chapter_types = [ - [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], + [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"], [r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, "Searching for emphasized lines"], # Emphasized lines [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering From dd96c645f020cd57682bbeba8501c21b8b77b0b9 Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 7 Jan 2011 14:19:12 +0800 Subject: [PATCH 2/5] tied line histogram into txt paragraph structure detection --- src/calibre/ebooks/conversion/preprocess.py | 2 ++ src/calibre/ebooks/txt/input.py | 2 +- src/calibre/ebooks/txt/processor.py | 31 ++++++++++++++------- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 29006ffd9b..97aaa653a9 100644 --- 
a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -78,6 +78,8 @@ class DocAnalysis(object): linere = re.compile('(?<=
)(?!\s*
).*?(?=
)', re.DOTALL) elif format == 'spanned_html': linere = re.compile('(?<=)', re.DOTALL) + elif format == 'txt': + linere = re.compile('.*?\n', re.DOTALL) self.lines = linere.findall(raw) def line_length(self, percent): diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 47e92a45a9..7fb22755de 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -90,7 +90,7 @@ class TXTInput(InputFormatPlugin): # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. - if options.paragraph_type == 'single': + if options.paragraph_type in ('single', 'unformatted'): txt = separate_paragraphs_single_line(txt) elif options.paragraph_type == 'print': txt = separate_paragraphs_print_formatted(txt) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index f6d628e7c5..53935584d2 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -9,6 +9,7 @@ import os, re from calibre import prepare_string_for_xml, isbytestring from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.conversion.preprocess import DocAnalysis __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' @@ -102,26 +103,36 @@ def detect_paragraph_type(txt): print: Each paragraph starts with a 2+ spaces or a tab and ends when a new paragraph is reached. markdown: Markdown formatting is in the document. 
+ unformatted: most lines have hard line breaks, few/no spaces or indents - returns block, single, print, markdown + returns block, single, print, markdown, unformatted ''' txt = txt.replace('\r\n', '\n') txt = txt.replace('\r', '\n') txt_line_count = len(re.findall('(?mu)^\s*.+$', txt)) - # Check for print - tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) - if tab_line_count / float(txt_line_count) >= .25: - return 'print' + # Check for hard line breaks - true if 55% of the doc breaks in the same region + docanalysis = DocAnalysis('txt', txt) + hardbreaks = docanalysis.line_histogram(.55) - # Check for block - empty_line_count = len(re.findall('(?mu)^\s*$', txt)) - if empty_line_count / float(txt_line_count) >= .25: - return 'block' + if hardbreaks: + # Check for print + tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) + if tab_line_count / float(txt_line_count) >= .25: + return 'print' + + # Check for block + empty_line_count = len(re.findall('(?mu)^\s*$', txt)) + if empty_line_count / float(txt_line_count) >= .25: + return 'block' + + # Assume unformatted text with hardbreaks if nothing else matches + return 'unformatted' - # Nothing else matched to assume single. 
+ # return single if hardbreaks is false return 'single' + def detect_formatting_type(txt): # Check for markdown # Headings From 90177a42053f29c302faf7483de6dd3fc455d400 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 8 Jan 2011 09:23:32 +0800 Subject: [PATCH 3/5] tweaked threshold for preprocess --- src/calibre/ebooks/conversion/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index f367aa02d7..5db920b01d 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -194,7 +194,7 @@ class PreProcessor(object): totalwords = 0 totalwords = self.get_word_count(html) - if totalwords < 20: + if totalwords < 50: self.log("not enough text, not preprocessing") return html From 5854f5308e46d1be747cf85d789d9ca9de78e80b Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 8 Jan 2011 19:45:51 +0800 Subject: [PATCH 4/5] moved punctuation unwrap into a function, tied to txt input --- src/calibre/ebooks/conversion/utils.py | 20 +++++++++++++++++--- src/calibre/ebooks/txt/input.py | 10 ++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 5db920b01d..27dacdf5fb 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -184,7 +184,22 @@ class PreProcessor(object): self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters") return html - + def punctuation_unwrap(self, length, content, format): + # define the pieces of the regex + lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?\s*()?" 
+ blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*" + line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" + txt_line_wrap = u"(\u0020|\u0009)*\n" + + unwrap_regex = lookahead+line_ending+blanklines+line_opening + if format == 'txt': + unwrap_regex = lookahead+txt_line_wrap + + unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE) + content = unwrap.sub(' ', content) + return content + def __call__(self, html): self.log("********* Preprocessing HTML *********") @@ -312,8 +327,7 @@ class PreProcessor(object): self.log("Done dehyphenating") # Unwrap lines using punctation and line length #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE) - unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - html = unwrap.sub(' ', html) + html = self.punctuation_unwrap(length, html, 'html') #check any remaining hyphens, but only unwrap if there is a match dehyphenator = Dehyphenator() html = dehyphenator(html,'html_cleanup', length) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 7fb22755de..98756c5fa1 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -95,6 +95,16 @@ class TXTInput(InputFormatPlugin): elif options.paragraph_type == 'print': txt = separate_paragraphs_print_formatted(txt) + if options.paragraph_type == 'unformatted': + from calibre.ebooks.conversion.utils import PreProcessor + from calibre.ebooks.conversion.preprocess import DocAnalysis + # get length + docanalysis = DocAnalysis('txt', txt) + length = docanalysis.line_length(.5) + # unwrap lines based on punctuation + preprocessor = PreProcessor(options, log=getattr(self, 'log', 
None)) + txt = preprocessor.punctuation_unwrap(length, txt, 'txt') + flow_size = getattr(options, 'flow_size', 0) html = convert_basic(txt, epub_split_size_kb=flow_size) From f88045c16266474ed625a0e38b0a9fa12aded75d Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 8 Jan 2011 20:35:19 +0800 Subject: [PATCH 5/5] fixed comments --- src/calibre/ebooks/txt/processor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 53935584d2..c6cf1078cd 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -102,10 +102,9 @@ def detect_paragraph_type(txt): single: Each line is a paragraph. print: Each paragraph starts with a 2+ spaces or a tab and ends when a new paragraph is reached. - markdown: Markdown formatting is in the document. unformatted: most lines have hard line breaks, few/no spaces or indents - returns block, single, print, markdown, unformatted + returns block, single, print, unformatted ''' txt = txt.replace('\r\n', '\n') txt = txt.replace('\r', '\n')