From 6e64f5ec4e0cb66cc0bf722ecf050f72fb9120dc Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 24 Jan 2011 12:20:50 +0800 Subject: [PATCH 1/6] doc tweaks, delete empty paragraphs during Heuristics --- src/calibre/ebooks/conversion/utils.py | 2 ++ src/calibre/manual/conversion.rst | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index aabb1b8bc4..14eca46b07 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -367,6 +367,8 @@ class HeuristicProcessor(object): html = re.sub(ur'\s*\s*', ' ', html) # Delete microsoft 'smart' tags html = re.sub('(?i)', '', html) + # Delete self closing paragraph tags + html = re.sub('', '', html) # Get rid of empty span, bold, font, em, & italics tags html = re.sub(r"\s*]*>\s*(]*>\s*){0,2}\s*\s*", " ", html) html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*\s*){0,2}\s*", " ", html) diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index 6ec986f26a..7f3ff21fe0 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -587,11 +587,11 @@ TXT input supports a number of options to differentiate how paragraphs are detec Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when the next line that starts with an indent is reached:: - This is the + This is the first. - This is the second. + This is the second. - This is the + This is the third. :guilabel:`Paragraph Style: Unformatted` @@ -603,7 +603,7 @@ TXT input supports a number of options to differentiate how paragraphs are detec formatting will be applied. :guilabel:`Formatting Style: Heuristic` - Analyses the document for common chapter headings, scene breaks, and italicized words and applies the + Analyzes the document for common chapter headings, scene breaks, and italicized words and applies the appropriate html markup during conversion. :guilabel:`Formatting Style: Markdown` From 1e675af91aa665334261dafdc50f89724d2bf4f4 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 24 Jan 2011 12:22:42 +0800 Subject: [PATCH 2/6] Always apply title case to downloaded metadata - tired of crap metadata sources with 99% lowercase characters --- src/calibre/ebooks/metadata/fetch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 390f288d8e..8018f42b13 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -411,7 +411,7 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None, r.pubdate = pubdate def fix_case(x): - if x and x.isupper(): + if x: x = titlecase(x) return x From a20cd3336dc5a309543dd5134a4f85bf563b9b09 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 24 Jan 2011 13:47:53 +0800 Subject: [PATCH 3/6] improved negative lookaheads in heuristics --- src/calibre/ebooks/conversion/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index e8c2acb9d6..22639801ff 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -203,8 +203,8 @@ class HeuristicProcessor(object): blank_lines = "" opt_title_open = "(" opt_title_close = ")?" - n_lookahead_open = "\s+(?!" - n_lookahead_close = ")" + n_lookahead_open = "(?!\s*" + n_lookahead_close = ")\s*" default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(]*>)?(?=<)" simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(]*>)?(?=<)" From d03ae9e001165d6edc82fbdbf884b89cd0b35ed9 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 24 Jan 2011 14:20:48 +0800 Subject: [PATCH 4/6] fixed a regression in chapter markup logic which was causing false negatives --- src/calibre/ebooks/conversion/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 22639801ff..1eb063cdd8 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -215,7 +215,7 @@ class HeuristicProcessor(object): [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'], [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles [r"]*>\s*(]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines - [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters + [r"[^'\"]?(\d+(\.|:))\s*([\w\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon @@ -275,7 +275,7 @@ class HeuristicProcessor(object): self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ") if type_name == 'common': analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]) - elif self.min_chapters <= hits < max_chapters: + elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits: analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]) break else: From 670fc644edb235b4a1fed74ded7924a2b8c13ec4 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 24 Jan 2011 16:00:05 +0800 Subject: [PATCH 5/6] made chapter markup routine more Sigil friendly --- src/calibre/ebooks/conversion/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 1eb063cdd8..d9350e6adb 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -43,7 +43,7 @@ class HeuristicProcessor(object): self.html_preprocess_sections = self.html_preprocess_sections + 1 self.log.debug("marked " + unicode(self.html_preprocess_sections) + " chapters & titles. - " + unicode(chap) + ", " + unicode(title)) - return '

'+chap+'

\n

'+title+'

\n' + return '

'+chap+'

\n

'+title+'

\n' def chapter_break(self, match): chap = match.group('section') From 7485c9a5e200c20285ead3795025781f4d9ef31e Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 24 Jan 2011 16:30:29 +0800 Subject: [PATCH 6/6] fixed sigil integration to strip html from chapter titles, fixed softbreak handling and enabled integration with extra_css --- src/calibre/ebooks/conversion/utils.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index d9350e6adb..f6e259b6f9 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -25,13 +25,15 @@ class HeuristicProcessor(object): self.chapters_with_title = 0 self.blanks_deleted = False self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) - self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) + self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) + self.softbreak = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.multi_blank = re.compile(r'(\s*]*>\s*

){2,}', re.IGNORECASE) def is_pdftohtml(self, src): return '' in src[:1000] def chapter_head(self, match): + from calibre.utils.html2text import html2text chap = match.group('chap') title = match.group('title') if not title: @@ -40,10 +42,12 @@ class HeuristicProcessor(object): " chapters. - " + unicode(chap)) return '

'+chap+'

\n' else: + txt_chap = html2text(chap) + txt_title = html2text(title) self.html_preprocess_sections = self.html_preprocess_sections + 1 self.log.debug("marked " + unicode(self.html_preprocess_sections) + " chapters & titles. - " + unicode(chap) + ", " + unicode(title)) - return '

'+chap+'

\n

'+title+'

\n' + return '

'+chap+'

\n

'+title+'

\n' def chapter_break(self, match): chap = match.group('section') @@ -469,7 +473,7 @@ class HeuristicProcessor(object): if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False): self.log.debug("deleting blank lines") self.blanks_deleted = True - html = self.multi_blank.sub('\n

', html) + html = self.multi_blank.sub('\n

', html) html = self.blankreg.sub('', html) # Determine line ending type @@ -524,11 +528,11 @@ class HeuristicProcessor(object): # Center separator lines html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) if not self.blanks_deleted: - html = self.multi_blank.sub('\n

', html) - html = re.sub(']*>\s*

', '

', html) + html = self.multi_blank.sub('\n

', html) + html = re.sub(']*>\s*

', '

', html) if self.deleted_nbsps: # put back non-breaking spaces in empty paragraphs to preserve original formatting html = self.blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) - + html = self.softbreak.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) return html