From 8f677800e65e1e7f0d3ba673f3337d4c4133ce93 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 25 Sep 2024 23:19:20 +0530
Subject: [PATCH] PDF new engine: Improve header/footer detection

---
 src/calibre/ebooks/pdf/reflow.py | 160 +++++++++++++++++++------------
 1 file changed, 97 insertions(+), 63 deletions(-)

diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 0e1f6fa9f7..cd92f441c3 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -15,8 +15,9 @@ from lxml import etree
 
 #### Pages/lines
 
-# How many pages to scan when finding header/footer automatically
+# How many pages/lines to scan when finding header/footer automatically
 PAGE_SCAN_COUNT = 20		# Arbitrary
+LINE_SCAN_COUNT = 2		# Arbitrary
 
 # Fraction of a character width that two strings have to be apart,
 # for them to be considered part of the same text fragment
@@ -712,6 +713,7 @@ class Page:
             # Need to keep any href= (and others?)
             if len(tx) == 0 \
               or text.top < self.top \
+              or text.top > self.height \
               or text.left > self.left+self.width \
               or text.left < self.left:
               #and re.match(r'href=', text.raw) is None:
@@ -1006,28 +1008,34 @@ class Page:
         # if there is a regex supplied
         if len(opts.pdf_header_regex) > 0 \
           and len(self.texts) > 0:
-            # Remove the first line if it matches
-            if re.match(opts.pdf_header_regex, self.texts[0].text_as_string) is not None :
-                # There can be fragments which are spread out, so join_fragments has not coalesced them
-                # Not sure that this would work as it relies on the first fragment matching regex
-                t = self.texts[0]
-                #match = self.find_match(t)
-                #while match is not None:
-                #    self.texts.remove(match)
-                #    match = self.find_match(t)
-                self.texts.remove(t)
+            # Remove up to LINE_SCAN_COUNT leading lines that match the header regex
+            for i in range(LINE_SCAN_COUNT):
+                if len(self.texts) < 1:
+                    break
+                if re.match(opts.pdf_header_regex, self.texts[0].text_as_string) is not None :
+                    # There could be fragments which are spread out, so join_fragments has not coalesced them
+                    # Not sure that this would work as it relies on the first fragment matching regex
+                    t = self.texts[0]
+                    #match = self.find_match(t)
+                    #while match is not None:
+                    #    self.texts.remove(match)
+                    #    match = self.find_match(t)
+                    self.texts.remove(t)
 
         if len(opts.pdf_footer_regex) > 0 \
           and len(self.texts) > 0:
-            # Remove the last line if it matches
-            if re.match(opts.pdf_footer_regex, self.texts[-1].text_as_string) is not None :
-                # There can be fragments which are spread out, so join_fragments has not coalesced them
-                t = self.texts[-1]
-                #match = self.find_match(t)
-                #while match is not None:
-                #    self.texts.remove(match)
-                #    match = self.find_match(t)
-                self.texts.remove(t)
+            # Remove up to LINE_SCAN_COUNT trailing lines that match the footer regex
+            for i in range(LINE_SCAN_COUNT):
+                if len(self.texts) < 1:
+                    break
+                if re.match(opts.pdf_footer_regex, self.texts[-1].text_as_string) is not None :
+                    # There could be fragments which are spread out, so join_fragments has not coalesced them
+                    t = self.texts[-1]
+                    #match = self.find_match(t)
+                    #while match is not None:
+                    #    self.texts.remove(match)
+                    #    match = self.find_match(t)
+                    self.texts.remove(t)
 
     def create_page_format(self, stats, opts):
         # Join fragments into lines
@@ -1683,62 +1691,80 @@ class PDFDocument:
         if (self.opts.pdf_header_skip >= 0 \
             and self.opts.pdf_footer_skip >= 0) \
           or len(self.pages) < 2:
-            # Doc is empty or 1 page.  Can't decide on any skips
+            # Both skips are already set, or the doc has fewer than 2 pages: nothing to detect
             return
 
         scan_count = PAGE_SCAN_COUNT
-        head_text = ''
-        head_match = 0
-        head_match1 = 0
+        head_text = [''] * LINE_SCAN_COUNT
+        head_match = [0] * LINE_SCAN_COUNT
+        head_match1 = [0] * LINE_SCAN_COUNT
+        #head_text = ''
+        #head_match = 0
+        #head_match1 = 0
         head_page = 0
         head_skip = 0
-        foot_text = ''
-        foot_match = 0
-        foot_match1 = 0
+        foot_text = [''] * LINE_SCAN_COUNT
+        foot_match = [0] * LINE_SCAN_COUNT
+        foot_match1 = [0] * LINE_SCAN_COUNT
+        #foot_text = ''
+        #foot_match = 0
+        #foot_match1 = 0
         foot_page = 0
         foot_skip = 0
-        pagenum_text = r'.*\d+\s+\w+\s+\d+.*'
+        pagenum_text = r'(.*\d+\s+\w+\s+\d+.*)|(\s*\d+\s+.*)|(^\s*[ivxlcIVXLC]+\s*$)'
 
         pages_to_scan = scan_count
         # Note the a line may be in more than 1 part
         # e.g. Page 1 of 6 ... DocName.pdf
-        # so merge first 2 lines if same top
+        # so the first 2 lines should be merged if they have the same top (not done yet)
         # Ditto last 2 lines
         # Maybe should do more than 2 parts
         for page in self.pages:
-            if self.opts.pdf_header_skip < 0 and len(page.texts) > 0:
-                t = page.texts[0].text_as_string
-                if len(page.texts) > 1 and page.texts[0].top == page.texts[1].top:
-                    t += page.texts[1].text_as_string
-                if len(head_text) == 0:
-                    head_text = t
-                else:
-                    if head_text == t:
-                        head_match += 1
-                        if head_page == 0:
-                            head_page = page.number
-                    else:	# Look for page count of format 'n xxx n'
-                        if re.match(pagenum_text, t) is not None:
-                            head_match1 += 1
+            if self.opts.pdf_header_skip < 0 \
+              and len(page.texts) > 0:
+                # There is something at the top of the page
+                for head_ind in range(LINE_SCAN_COUNT):
+                    if len(page.texts) < head_ind+1 \
+                      or page.texts[head_ind].top > page.height/2:
+                        break  # Short page
+                    t = page.texts[head_ind].text_as_string
+                    #if len(page.texts) > 1 and page.texts[0].top == page.texts[1].top:
+                    #    t += ' ' + page.texts[1].text_as_string
+                    if len(head_text[head_ind]) == 0:
+                        head_text[head_ind] = t
+                    else:
+                        if head_text[head_ind] == t:
+                            head_match[head_ind] += 1
                             if head_page == 0:
                                 head_page = page.number
+                        else:	# Look for page count of format 'n xxx n'
+                            if re.match(pagenum_text, t) is not None:
+                                head_match1[head_ind] += 1
+                                if head_page == 0:
+                                    head_page = page.number
 
-            if self.opts.pdf_footer_skip < 0 and len(page.texts) > 0:
-                t = page.texts[-1].text_as_string
-                if len(page.texts) > 1 and page.texts[-1].top == page.texts[-2].top:
-                    t += page.texts[-2].text_as_string
-                if len(foot_text) == 0:
-                    foot_text = t
-                else:
-                    if foot_text == t:
-                        foot_match += 1
-                        if foot_page == 0:
-                            foot_page = page.number
-                    else:	# Look for page count of format 'n xxx n'
-                        if re.match(pagenum_text, t) is not None:
-                            foot_match1 += 1
+            if self.opts.pdf_footer_skip < 0 \
+              and len(page.texts) > 0:
+                # There is something at the bottom of the page
+                for foot_ind in range(LINE_SCAN_COUNT):
+                    if len(page.texts) < foot_ind+1 \
+                      or page.texts[-foot_ind-1].top < page.height/2:
+                        break  # Short page
+                    t = page.texts[-foot_ind-1].text_as_string
+                    #if len(page.texts) > 1 and page.texts[-1].top == page.texts[-2].top:
+                    #    t += ' ' + page.texts[-2].text_as_string
+                    if len(foot_text[foot_ind]) == 0:
+                        foot_text[foot_ind] = t
+                    else:
+                        if foot_text[foot_ind] == t:
+                            foot_match[foot_ind] += 1
                             if foot_page == 0:
                                 foot_page = page.number
+                        else:	# Look for page count of format 'n xxx n'
+                            if re.match(pagenum_text, t) is not None:
+                                foot_match1[foot_ind] += 1
+                                if foot_page == 0:
+                                    foot_page = page.number
 
             pages_to_scan -= 1
             if pages_to_scan < 1:
@@ -1752,12 +1778,20 @@ class PDFDocument:
             pages_to_scan = scan_count
         pages_to_scan /= 2	# Are at least half matching?
 
-        if head_match > pages_to_scan or head_match1 > pages_to_scan:
-            t = self.pages[head_page].texts[0]
+        head_ind = 0
+        for i in range(LINE_SCAN_COUNT):
+            if head_match[i] > pages_to_scan or head_match1[i] > pages_to_scan:
+                head_ind = i  # Remember the last matching line
+        if head_match[head_ind] > pages_to_scan or head_match1[head_ind] > pages_to_scan:
+            t = self.pages[head_page].texts[head_ind]
             head_skip = t.top + t.height + 1
 
-        if foot_match > pages_to_scan or foot_match1 > pages_to_scan:
-            t = self.pages[foot_page].texts[-1]
+        foot_ind = 0
+        for i in range(LINE_SCAN_COUNT):
+            if foot_match[i] > pages_to_scan or foot_match1[i] > pages_to_scan:
+                foot_ind = i  # Remember the last matching line
+        if foot_match[foot_ind] > pages_to_scan or foot_match1[foot_ind] > pages_to_scan:
+            t = self.pages[foot_page].texts[-foot_ind-1]
             foot_skip = t.top - 1
 
         if head_skip > 0:
@@ -1773,8 +1807,8 @@ class PDFDocument:
             while removed:
                 removed = False
                 for t in page.texts:
-                    if self.opts.pdf_header_skip > 0 and t.top < self.opts.pdf_header_skip \
-                    or self.opts.pdf_footer_skip > 0 and t.top > self.opts.pdf_footer_skip:
+                    if (self.opts.pdf_header_skip > 0 and t.top < self.opts.pdf_header_skip) \
+                    or (self.opts.pdf_footer_skip > 0 and t.top > self.opts.pdf_footer_skip):
                         page.texts.remove(t)
                         removed = True
                         break    # Restart loop