regex tweaks, including fixes for http://bugs.calibre-ebook.com/ticket/6976

2025-08-30 23:00:21 -04:00 · 2010-09-27 17:00:04 +08:00 · 2010-09-27 17:00:04 +08:00 · 1055919806
commit 1055919806
parent 32f231549f
2 changed files with 12 additions and 10 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -356,7 +356,7 @@ class HTMLPreProcessor(object):
                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),

                  # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
+                  (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
                  # Cover the case where every letter in a chapter title is separated by a space
                  (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),

@ -461,10 +461,10 @@ class HTMLPreProcessor(object):
        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
            length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'), 'median')
            if length:
-                # print "The pdf line length returned is " + str(length)
+                print "The pdf line length returned is " + str(length)
                end_rules.append(
                    # Un wrap using punctuation
-                    (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                )
                # unwrap em/en dashes
                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -77,6 +77,11 @@ class PreProcessor(object):

    def __call__(self, html):
        self.log("*********  Preprocessing HTML  *********")
+
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = re.sub(r"\s*</p>", "</p>\n", html)
+        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+        
        ###### Check Markup ######
        #
        # some lit files don't have any <p> tags or equivalent (generally just plain text between
@ -135,9 +140,7 @@ class PreProcessor(object):
               #print "blanks between paragraphs is marked True"
            else:
                blanks_between_paragraphs = False
-        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-        html = re.sub(r"\s*</p>", "</p>\n", html)
-        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")  
        # detect chapters/sections to match xpath or splitting logic
        #
        # Build the Regular Expressions in pieces
@ -160,11 +163,10 @@ class PreProcessor(object):
        default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
        numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
-        uppercase_chapters = r"\s*.?([A-Z#\-\s]+)\s*"
+        uppercase_chapters = r"\s*.?([A-Z#\-]+\s{0,3}){1,5}\s*"
        
        chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-        #print chapter_marker
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")       
+        #print chapter_marker     
        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
        self.html_preprocess_sections = len(heading.findall(html))
        self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
@ -183,9 +185,9 @@ class PreProcessor(object):
            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
            chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
            chapdetect2 = re.compile(r'%s' % chapter_marker,  re.UNICODE)
+            print str(chapter_marker)
            #chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
            html = chapdetect2.sub(self.chapter_head, html)
-
        ###### Unwrap lines ######
        #
        # Some OCR sourced files have line breaks in the html using a combination of span & p tags