From 80065cb443021536762bf0fdf8d479b1b06bbd0d Mon Sep 17 00:00:00 2001 From: ldolse Date: Wed, 19 Jan 2011 00:18:11 +0800 Subject: [PATCH] tweaked chapter_markup for false positives/negatives --- src/calibre/ebooks/conversion/utils.py | 10 ++++------ src/calibre/manual/conversion.rst | 12 ++++++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index bcc6f5a236..812a863717 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -174,7 +174,7 @@ class HeuristicProcessor(object): if wordcount > 200000: typical_chapters = 15000. self.min_chapters = int(ceil(wordcount / typical_chapters)) - print "minimum chapters required are: "+str(self.min_chapters) + self.log.debug("minimum chapters required are: "+str(self.min_chapters)) heading = re.compile(']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") @@ -208,12 +208,12 @@ class HeuristicProcessor(object): n_lookahead_close = ")" default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(]*>)?(?=<)" - simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter).{0,50}?(]*>)?(?=<)" + simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(]*>)?(?=<)" analysis_result = [] chapter_types = [ - [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'], + [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'], [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, 
"Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles [r"]*>\s*(]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters @@ -274,10 +274,9 @@ class HeuristicProcessor(object): title_req = True strict_title = False self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ") - print "max chapters is "+str(self.max_chapters) if type_name == 'common': analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]) - elif self.min_chapters <= hits < self.max_chapters: + elif self.min_chapters <= hits < max_chapters: analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]) break else: @@ -423,7 +422,6 @@ class HeuristicProcessor(object): except: self.log.warn("Can't get wordcount") - print "found "+unicode(self.totalwords)+" words in the flow" if self.totalwords < 50: self.log.warn("flow is too short, not running heuristics") return html diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index 94a3a60721..e7c09a57a5 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -285,10 +285,14 @@ remove all non-breaking-space entities. 
:guilabel:`Detect and markup unformatted chapter headings and sub headings` If your document does not have Chapter Markers and titles formatted differently from the rest of the text, |app| can use this option to attempt detection them and surround them with heading tags. <h2> tags are used - for chapter headings; <h3> tags are used for any titles that are detected. This function will - not create a TOC, but in many cases it will cause |app|'s default chapter detection settings to correctly - detect chapters and build a TOC. Adjust the Xpath under Structure Detection if a TOC is not automatically - created. The inserted heading tags are not formatted, to apply formatting use the 'extra_css' option under + for chapter headings; <h3> tags are used for any titles that are detected. + + This function will not create a TOC, but in many cases it will cause |app|'s default chapter detection settings + to correctly detect chapters and build a TOC. Adjust the XPath under Structure Detection if a TOC is not automatically + created. If there are no other headings used in the document, then setting "//h:h2" under Structure Detection would + be the easiest way to create a TOC for the document. + + The inserted headings are not formatted; to apply formatting, use the 'extra_css' option under the Look and Feel conversion settings. For example, to center heading tags, use the following:: h2, h3 { text-align: center }