implemented multi-pass analysis for chapter detection

2025-07-09 03:04:10 -04:00 · 2011-01-17 23:53:27 +08:00 · 2011-01-17 23:53:27 +08:00 · a0aa719bb0
commit a0aa719bb0
parent 05730e1886
1 changed files with 68 additions and 25 deletions
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -21,6 +21,7 @@ class HeuristicProcessor(object):
        self.deleted_nbsps = False
        self.totalwords = 0
        self.min_chapters = 1
        self.max_chapters = 150
        self.chapters_no_title = 0
        self.chapters_with_title = 0
        self.blanks_deleted = False
@ -132,7 +133,7 @@ class HeuristicProcessor(object):
    def markup_italicis(self, html):
        ITALICIZE_WORDS = [
            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
-            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
+            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
            'Mlle.', 'Mons.', 'PS.', 'PPS.',
        ]
@ -166,9 +167,11 @@ class HeuristicProcessor(object):
        with minimum false positives.  Exits after finding a successful pattern
        '''
        # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
-        # minimum of chapters to search for
+        # minimum of chapters to search for.  A max limit is calculated to prevent things like OCR
        # or pdf page numbers from being treated as TOC markers
        if wordcount > 7000:
            self.min_chapters = int(ceil(wordcount / 7000.))
            self.max_chapters = int(ceil(wordcount / 100.))
        #print "minimum chapters required are: "+str(self.min_chapters)
        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
        self.html_preprocess_sections = len(heading.findall(html))
@ -202,44 +205,84 @@ class HeuristicProcessor(object):
        n_lookahead_open = "\s+(?!"
        n_lookahead_close = ")"
-        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter).{0,50}?(</[ibu][^>]*>)?(?=<)"
        analysis_result = []
        chapter_types = [
-            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for most common chapter headings", 'chapter'],  # Highest frequency headings which include titles
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common section headings", 'common'],
+            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],  # Highest frequency headings which include titles
-            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
-            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
+            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
-            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
-            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
-            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric headings", 'plain_number'],  # Numeric Chapters, no dot or colon
+            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'],  # Numeric Chapters, no dot or colon
-            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
            ]
        def recurse_patterns(html, analyze):
            # Start with most typical chapter headings, get more aggressive until one works
-            for [chapter_type, lookahead_ignorecase, log_message, type_name] in chapter_types:
+            for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
                n_lookahead = ''
                hits = 0
                self.chapters_no_title = 0
                self.chapters_with_title = 0
                if n_lookahead_req:
                    lp_n_lookahead_open = n_lookahead_open
                    lp_n_lookahead_close = n_lookahead_close
                else:
                    lp_n_lookahead_open = ''
                    lp_n_lookahead_close = ''
                if strict_title:
                    lp_title = default_title
                else:
                    lp_title = simple_title
                if ignorecase:
                    arg_ignorecase = r'(?i)'
                else:
                    arg_ignorecase = ''
                if title_req:
                    lp_opt_title_open = ''
                    lp_opt_title_close = ''        
                else:
                    lp_opt_title_open = opt_title_open
                    lp_opt_title_close = opt_title_close
                if self.html_preprocess_sections >= self.min_chapters:
                    break
                full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
                if n_lookahead_req:
                    n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
                if not analyze:
                    self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
-                if lookahead_ignorecase:
+
-                    chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+                chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
-                    chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+                chapdetect = re.compile(r'%s' % chapter_marker)
-                else:
+
                    chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
                    chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
                if analyze:
                    hits = len(chapdetect.findall(html))
-                    print unicode(type_name)+" had "+unicode(hits)+" hits"
+                    if hits:
                        chapdetect.sub(self.analyze_title_matches, html)
-                    print unicode(self.chapters_no_title)+" chapters with no title"
+                        if float(self.chapters_with_title) / float(hits) > .5:
-                    print unicode(self.chapters_with_title)+" chapters with titles"
+                            title_req = True
                            strict_title = False
                        self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
                        if type_name == 'common':
                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                        elif self.min_chapters <= hits < self.max_chapters:
                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                            break
                else:
                    html = chapdetect.sub(self.chapter_head, html)
            return html
        recurse_patterns(html, True)
        chapter_types = analysis_result
        html = recurse_patterns(html, False)
        words_per_chptr = wordcount
@ -293,7 +336,7 @@ class HeuristicProcessor(object):
        pre = re.compile(r'<pre>', re.IGNORECASE)
        if len(pre.findall(html)) >= 1:
            self.log.debug("Running Text Processing")
-            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
+            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
            html = outerhtml.sub(self.txt_process, html)
        else:
            # Add markup naively