diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index ad7f5f117d..f6e259b6f9 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -25,13 +25,15 @@ class HeuristicProcessor(object): self.chapters_with_title = 0 self.blanks_deleted = False self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) - self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) + self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) + self.softbreak = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.multi_blank = re.compile(r'(\s*]*>\s*

){2,}', re.IGNORECASE) def is_pdftohtml(self, src): return '' in src[:1000] def chapter_head(self, match): + from calibre.utils.html2text import html2text chap = match.group('chap') title = match.group('title') if not title: @@ -40,10 +42,12 @@ class HeuristicProcessor(object): " chapters. - " + unicode(chap)) return '

'+chap+'

\n' else: + txt_chap = html2text(chap) + txt_title = html2text(title) self.html_preprocess_sections = self.html_preprocess_sections + 1 self.log.debug("marked " + unicode(self.html_preprocess_sections) + " chapters & titles. - " + unicode(chap) + ", " + unicode(title)) - return '

'+chap+'

\n

'+title+'

\n' + return '

'+chap+'

\n

'+title+'

\n' def chapter_break(self, match): chap = match.group('section') @@ -203,8 +207,8 @@ class HeuristicProcessor(object): blank_lines = "" opt_title_open = "(" opt_title_close = ")?" - n_lookahead_open = "\s+(?!" - n_lookahead_close = ")" + n_lookahead_open = "(?!\s*" + n_lookahead_close = ")\s*" default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(]*>)?(?=<)" simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(]*>)?(?=<)" @@ -215,7 +219,7 @@ class HeuristicProcessor(object): [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'], [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles [r"]*>\s*(]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines - [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters + [r"[^'\"]?(\d+(\.|:))\s*([\w\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon @@ -275,7 +279,7 @@ class HeuristicProcessor(object): self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ") if type_name == 'common': analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]) - elif self.min_chapters <= hits < max_chapters: + elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits: analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]) break else: @@ -367,6 +371,8 @@ class HeuristicProcessor(object): html = re.sub(ur'\s*\s*', ' ', html) # Delete microsoft 'smart' tags html = re.sub('(?i)', '', html) + # Delete self closing paragraph tags + html = re.sub('', '', html) # Get rid of empty span, bold, font, em, & italics tags html = re.sub(r"\s*]*>\s*(]*>\s*){0,2}\s*\s*", " ", html) html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*\s*){0,2}\s*", " ", html) @@ -467,7 +473,7 @@ class HeuristicProcessor(object): if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False): self.log.debug("deleting blank lines") self.blanks_deleted = True - html = self.multi_blank.sub('\n

', html) + html = self.multi_blank.sub('\n

', html) html = self.blankreg.sub('', html) # Determine line ending type @@ -522,11 +528,11 @@ class HeuristicProcessor(object): # Center separator lines html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) if not self.blanks_deleted: - html = self.multi_blank.sub('\n

', html) - html = re.sub(']*>\s*

', '

', html) + html = self.multi_blank.sub('\n

', html) + html = re.sub(']*>\s*

', '

', html) if self.deleted_nbsps: # put back non-breaking spaces in empty paragraphs to preserve original formatting html = self.blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) - + html = self.softbreak.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) return html diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 390f288d8e..8018f42b13 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -411,7 +411,7 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None, r.pubdate = pubdate def fix_case(x): - if x and x.isupper(): + if x: x = titlecase(x) return x diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index 6ec986f26a..7f3ff21fe0 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -587,11 +587,11 @@ TXT input supports a number of options to differentiate how paragraphs are detec Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when the next line that starts with an indent is reached:: - This is the + This is the first. - This is the second. + This is the second. - This is the + This is the third. :guilabel:`Paragraph Style: Unformatted` @@ -603,7 +603,7 @@ TXT input supports a number of options to differentiate how paragraphs are detec formatting will be applied. :guilabel:`Formatting Style: Heuristic` - Analyses the document for common chapter headings, scene breaks, and italicized words and applies the + Analyzes the document for common chapter headings, scene breaks, and italicized words and applies the appropriate html markup during conversion. :guilabel:`Formatting Style: Markdown`