diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index ac38a0097d..fffb0d75d4 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -195,7 +195,7 @@ class PreProcessor(object): numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*" uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*" numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*" - emphasized_lines = r"]*>\s*(]*>)?\s*(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*" + emphasized_lines = r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*" full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) @@ -308,10 +308,10 @@ class PreProcessor(object): html = re.sub(u'\xad\s*(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*', '', html) # If still no sections after unwrapping mark split points on lines with no punctuation - if self.html_preprocess_sections < 10: + if self.html_preprocess_sections < 5: self.log("Looking for more split points based on punctuation," " currently have " + unicode(self.html_preprocess_sections)) - chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P
(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?(){0,2}\s*()?\s*(){0,2}\s*()?\s*)', re.IGNORECASE) + chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P
(]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?(){0,2}\s*()?\s*(){0,2}\s*()?\s*)', re.IGNORECASE) html = chapdetect3.sub(self.chapter_break, html) # search for places where a first or second level heading is immediately followed by another # top level heading. demote the second heading to h3 to prevent splitting between chapter