diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index ad7f5f117d..f6e259b6f9 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -25,13 +25,15 @@ class HeuristicProcessor(object): self.chapters_with_title = 0 self.blanks_deleted = False self.linereg = re.compile('(?<=
)', re.IGNORECASE|re.DOTALL)
- self.blankreg = re.compile(r'\s*(?P ]*>)\s*(?P ]*>)\s*(?P ]*>)\s*(?P ]*>\s* ' + '\g ]*>\s* ]*>\s*'+chap+'
\n'
else:
+ txt_chap = html2text(chap)
+ txt_title = html2text(title)
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log.debug("marked " + unicode(self.html_preprocess_sections) +
" chapters & titles. - " + unicode(chap) + ", " + unicode(title))
- return ''+chap+'
\n'+title+'
\n'
+ return ''+chap+'
\n'+title+'
\n'
def chapter_break(self, match):
chap = match.group('section')
@@ -203,8 +207,8 @@ class HeuristicProcessor(object):
blank_lines = ""
opt_title_open = "("
opt_title_close = ")?"
- n_lookahead_open = "\s+(?!"
- n_lookahead_close = ")"
+ n_lookahead_open = "(?!\s*"
+ n_lookahead_close = ")\s*"
default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?([ibu][^>]*>)?(?=<)"
simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?([ibu][^>]*>)?(?=<)"
@@ -215,7 +219,7 @@ class HeuristicProcessor(object):
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
[r"]*>\s*(]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
- [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
+ [r"[^'\"]?(\d+(\.|:))\s*([\w\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
[r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
@@ -275,7 +279,7 @@ class HeuristicProcessor(object):
self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
if type_name == 'common':
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
- elif self.min_chapters <= hits < max_chapters:
+ elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits:
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
break
else:
@@ -367,6 +371,8 @@ class HeuristicProcessor(object):
html = re.sub(ur'\s*