From 2d975f654e3e7389295357187c56bfe7628ef117 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 21 Dec 2024 08:32:15 +0530 Subject: [PATCH] Rework setting up document stats for PDF input Improve the setup of document stats to fix bug 2089436 Fixes #2582 (Rework setting up document stats) --- src/calibre/ebooks/pdf/reflow.py | 333 ++++++++++++++++--------------- 1 file changed, 170 insertions(+), 163 deletions(-) diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index abbb66a1de..753002a6d7 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -32,6 +32,10 @@ COALESCE_FACTOR = 20.0 # Pixels from the PDF file BOTTOM_FACTOR = 2.0 +# Where lines overlap, top of one to bottom of next should exceed +# the line height by this factor for them to be considered diferent. +HEIGHT_FACTOR = 1.5 + # Fraction of text height that two strings' bottoms can differ by # for them to be considered to be part of the same text fragment LINE_FACTOR = 0.2 @@ -81,6 +85,25 @@ SAME_SPACE = 3.0 SAME_INDENT = 2.0 +def adjacent_quotes(first_string, second_string): + ''' Does one string end with a closing quote and the next start with an opening quote? ''' + + # Find last non-space char in first string + lchar = re.match(r'.*([^ ])\s*$', first_string) + last_char = ' ' # Nothing interesting + if lchar is not None: + last_char = lchar.group(1) # Final non-space char + + # Find first non-space char in second string + fchar = re.match(r'\s*([^ ])', second_string) + first_char = ' ' # Nothing interesting + if fchar is not None: + first_char = fchar.group(1) # First non-space char + + return bool((last_char == '\u0022' and first_char == '\u0022') \ + or (last_char == '\u2019' and first_char == '\u2018') \ + or (last_char == '\u201d' and first_char == '\u201c')) + class Font: def __init__(self, spec): @@ -105,8 +128,8 @@ class Element: class DocStats: def __init__(self): - self.top = self.bottom = self.left_odd = self.left_even = self.right \ - = self.line_space = self.para_space = self.indent_odd = self.indent_even = 0 + self.top = self.bottom = self.left_min_odd = self.left_min_even = self.right \ + = self.line_space = self.para_space = self.indent_min_odd = self.indent_min_even = 0 self.font_size = 0 class Image(Element): @@ -238,7 +261,7 @@ class Text(Element): self.left = min(self.left, other.left) self.right = max(self.right, other.right) self.width += other.width - self.final_width += other.final_width + self.final_width = other.left + other.width self.height = self.bottom - self.top # Need to check for = secnd.bottom-BOTTOM_FACTOR) \ - or (secnd.top <= frst.top and secnd.bottom >= frst.bottom-BOTTOM_FACTOR) : + or (secnd.top <= frst.top and secnd.bottom >= frst.bottom-BOTTOM_FACTOR): # Overlap = same line if frst.left < secnd.left : return -1 @@ -840,7 +863,7 @@ class Page: # BOTTOM_FACTOR allows for this top = min(frag.top, t.top) bot = max(frag.bottom, t.bottom) - if bot - top < line_height * 1.5 \ + if bot - top < line_height * HEIGHT_FACTOR \ and ((frag.top == t.top or frag.bottom == t.bottom) \ or (frag.top < t.top and frag.bottom > t.top+BOTTOM_FACTOR) \ or (frag.top < t.top and frag.bottom+BOTTOM_FACTOR > t.bottom) \ @@ -876,6 +899,12 @@ class Page: x = frag frag = match match = x + if match.left < frag.right: + # Text overlaps. Do we have a blurred character? + if len(match.text_as_string) == 1 \ + and match.left + match.width > frag.right \ + and frag.text_as_string[-1] == match.text_as_string[0]: + break # Overlapping same character, so ignore it frag.coalesce(match, self.number, self.left_margin, self.right_margin) break # Leave tind tind += 1 @@ -893,9 +922,9 @@ class Page: first = True # Assume not Contents self.contents = False - left = self.stats_left - indent = self.stats_indent - indent1 = self.stats_indent1 + left_max = self.stats_left_max + indent_min = self.stats_indent_min + indent_max = self.stats_indent_max m = len(self.texts) for i in range(m): @@ -918,9 +947,9 @@ class Page: self.contents = True t.tag = 'h2' # It won't get set later # Centered if left and right margins are within FACTOR% - # Because indents can waver a bit, use between indent and indent1 as == indent - if (lmargin < indent or lmargin > indent1) \ - and lmargin > left \ + # Because indents can waver a bit, use between indent_min and indent_max as == indent + if (lmargin < indent_min or lmargin > indent_max) \ + and lmargin > left_max \ and lmargin != xmargin \ and lmargin != ymargin \ and lmargin >= rmargin - rmargin*CENTER_FACTOR \ @@ -930,7 +959,7 @@ class Page: #and t.left + t.width + t.left <= self.width + l_offset + t.average_character_width: t.align = 'C' # Right aligned if left > FACTOR% of right - elif lmargin > indent \ + elif lmargin > indent_max \ and lmargin > rmargin*RIGHT_FACTOR: #and t.right >= self.width - t.average_character_width: # What about right-aligned but indented on right? @@ -960,17 +989,18 @@ class Page: for i in self.imgs: lmargin = i.left rmargin = self.width - i.right - if lmargin > left \ - and lmargin != indent \ + if lmargin > left_max \ + and lmargin != indent_min \ and lmargin >= rmargin - rmargin*CENTER_FACTOR \ and lmargin <= rmargin + rmargin*CENTER_FACTOR: i.align = 'C' def coalesce_paras(self, stats): # Join lines into paragraphs - left = self.stats_left - indent = self.stats_indent - indent1 = self.stats_indent1 + left_min = self.stats_left_min + left_max = self.stats_left_max + indent_min = self.stats_indent_min + indent_max = self.stats_indent_max def can_merge(self, first_text, second_text, stats): # Can two lines be merged into one paragraph? @@ -980,18 +1010,14 @@ class Page: # # The left can wander by a few (SAME_INDENT) pixels. # "float:left" occurs where there is a multi-line character, so indentation is messed up - lchar = re.match(r'.*([^ ])\s*$', first_text.text_as_string) - last_char = ' ' # Nothing interesting - if lchar is not None: - last_char = lchar.group(1) # Final non-space char same_left = bool(first_text.last_left-SAME_INDENT <= second_text.left <= first_text.last_left+SAME_INDENT) - if ((second_text.left < left + second_text.average_character_width \ + if ((second_text.left < left_min + second_text.average_character_width \ and (same_left \ or (second_text.left < first_text.last_left \ and (first_text.indented > 0 or '"float:left"' in first_text.raw)))) \ or (same_left \ and first_text.indented == 0 \ - and second_text.left >= indent) \ + and second_text.left >= indent_min) \ or (same_left \ and first_text.indented == second_text.indented \ and second_text.indented > 1) \ @@ -1002,9 +1028,7 @@ class Page: and first_text.bottom + stats.line_space + (stats.line_space*LINE_FACTOR) \ >= second_text.bottom \ and first_text.final_width > self.width*self.opts.unwrap_factor \ - and not ( (last_char == '\u0022' and second_text.text_as_string[0] == '\u0022') \ - or (last_char == '\u2019' and second_text.text_as_string[0] == '\u2018') \ - or (last_char == '\u201d' and second_text.text_as_string[0] == '\u201c')): + and not adjacent_quotes(first_text.text_as_string, second_text.text_as_string): # This has checked for single quotes (9...6), double quotes (99...66), and "..." # at end of 1 line then start of next as a check for Don't merge return True @@ -1043,13 +1067,13 @@ class Page: if frag.tag == 'p': if frag.indented == 0 \ and frag.align != 'C' \ - and frag.left > left + frag.average_character_width: + and frag.left > left_max + frag.average_character_width: # Is it approx self.stats_indent? - if indent <= frag.left <= indent1: + if indent_min <= frag.left <= indent_max: frag.indented = 1 # 1em else: # Assume left margin of approx = number of chars # Should check for values approx the same, as with indents - frag.margin_left = int(round(((frag.left - left) / self.stats_margin_px)+0.5)) + frag.margin_left = int(round(((frag.left - left_min) / self.stats_margin_px)+0.5)) if last_frag is not None \ and stats.para_space > 0 \ and frag.bottom - last_frag.bottom > stats.para_space*SECTION_FACTOR: @@ -1299,14 +1323,16 @@ class Page: # If there are alternating pages, pick the left and indent for this one if self.odd_even: - self.stats_left = stats.left_odd - self.stats_indent = stats.indent_odd - self.stats_indent1 = stats.indent_odd1 + self.stats_left_min = stats.left_min_odd + self.stats_left_max = stats.left_max_odd + self.stats_indent_min = stats.indent_min_odd + self.stats_indent_max = stats.indent_max_odd self.stats_right = stats.right # Needs work else: - self.stats_left = stats.left_even - self.stats_indent = stats.indent_even - self.stats_indent1 = stats.indent_even1 + self.stats_left_min = stats.left_min_even + self.stats_left_max = stats.left_max_even + self.stats_indent_min = stats.indent_min_even + self.stats_indent_max = stats.indent_max_even self.stats_right = stats.right # Needs work self.stats_margin_px = stats.margin_px @@ -1526,135 +1552,111 @@ class PDFDocument: return scount, soffset # Find (next) most popular indent - def find_indent(indents, skip): + def find_indent(indents): icount, ioffset = 0, 0 for i in indents: - if icount <= indents[i] \ - and (skip <= 0 or indents[i] < skip): - icount = indents[i] + ii = indents[i] + if ii > 0 \ + and icount <= ii: + icount = ii ioffset = i - return icount, ioffset + return ioffset def set_indents(indents, odd_even): # Find most popular left so that will be treated as left of page - indent_c = 0 - indent_k = indent_k1 = 0 - count = len(indents) - while count > 0: - c, k = find_indent(indents, indent_c) - if indent_c <= 0: - indent_c = c - if indent_k <= 0: - indent_k = k - elif abs(indent_k - k) <= SAME_INDENT: - indent_k = min(indent_k, k) - indent_k1 = max(indent_k1, k) - indent_c = min(indent_c, c) - else: - break - count -= 1 - - save_left = indent_k - if odd_even: - self.stats.left_odd = indent_k # Min left value - # Max left value - if indent_k1: - self.stats.left_odd1 = indent_k1 - else: - self.stats.left_odd1 = indent_k - else: - self.stats.left_even = indent_k # Min left value - # Max left value - if indent_k1: - self.stats.left_even1 = indent_k1 - else: - self.stats.left_even1 = indent_k + left_k1 = left_c = 0 + left_k = find_indent(indents) + # Find any adjacent indents and hide them + for k in indents.keys(): + kc = indents[k] + if kc > 0 \ + and abs(left_k - k) <= SAME_INDENT: + left_k = min(left_k, k) + left_k1 = max(left_k1, k) + left_c += kc + indents[k] = -kc # Ensure not found again # Find second most popular left so that will be treated as indent - indent_c -= 1 - total_c = 0 - indent_k = indent_k1 = 0 - count = len(indents) - while count > 0: - c, k = find_indent(indents, indent_c) - if indent_c <= 0: - indent_c = c - if indent_k <= 0: - indent_k = k - elif abs(indent_k - k) <= SAME_INDENT: + indent_k1 = indent_c = 0 + indent_k = find_indent(indents) + # Find any adjacent indents and hide them + for k in indents.keys(): + kc = indents[k] + if kc > 0 \ + and abs(indent_k - k) <= SAME_INDENT: indent_k = min(indent_k, k) indent_k1 = max(indent_k1, k) - indent_c = min(indent_c, c) - else: - break - total_c += c - count -= 1 + indent_c += kc + indents[k] = -kc # Ensure not found again # Find third most popular left as that might actually be the indent # if between left and current and occurs a reasonable number of times. - save_k = indent_k - save_k1 = indent_k1 - save_count = total_c - indent_c -= 1 - total_c = 0 - indent_k = indent_k1 = 0 - count = len(indents) - while count > 0: - c, k = find_indent(indents, indent_c) - if indent_c <= 0: - indent_c = c - if indent_k <= 0: - indent_k = k - elif abs(indent_k - k) <= SAME_INDENT: - indent_k = min(indent_k, k) - indent_k1 = max(indent_k1, k) - indent_c = min(indent_c, c) - else: - break - total_c += c - count -= 1 - # Is this to be used? - if (save_k < indent_k \ - and save_k > save_left) \ - or total_c < save_count / 2: - # The usual case. The first ones found are to be used - indent_k = save_k - indent_k1 = save_k1 + third_k1 = third_c = 0 + third_k = find_indent(indents) + # Find any adjacent indents and hide them + for k in indents.keys(): + kc = indents[k] + if kc > 0 \ + and abs(third_k - k) <= SAME_INDENT: + third_k = min(third_k, k) + third_k1 = max(third_k1, k) + third_c += kc + indents[k] = -kc # Ensure not found again - if odd_even: - self.stats.indent_odd = indent_k # Min indent value - # Max indent value - if indent_k1: - self.stats.indent_odd1 = indent_k1 - else: - self.stats.indent_odd1 = indent_k - else: - self.stats.indent_even = indent_k # Min indent value - # Max indent value - if indent_k1: - self.stats.indent_even1 = indent_k1 - else: - self.stats.indent_even1 = indent_k + # Is this to be used? + if third_k > 0 \ + and third_k < indent_k \ + and third_k > left_k \ + and third_c > indent_c / 2: + # The unusual case. The third ones found are to be used + indent_k = third_k + indent_k1 = third_k1 + + # Check that we have data in variables + if not indent_k: + # Nothing for indent, so make it beyond left + # otherwise left will appear to be the indent + indent_k = indent_k1 = left_k1 + SAME_INDENT + 1 # For safety, check left and indent are in the right order + if left_k > indent_k: + l = left_k + l1 = left_k1 + left_k = indent_k + left_k1 = indent_k1 + indent_k = l + indent_k1 = l1 + if odd_even: - if self.stats.indent_odd != 0 \ - and self.stats.left_odd > self.stats.indent_odd: - l = self.stats.left_odd - l1 = self.stats.left_odd1 - self.stats.left_odd = self.stats.indent_odd - self.stats.left_odd1 = self.stats.indent_odd1 - self.stats.indent_odd = l - self.stats.indent_odd1 = l1 + # Min left value + self.stats.left_min_odd = left_k + # Max left value + if left_k1: + self.stats.left_max_odd = left_k1 + else: + self.stats.left_max_odd = left_k + # Min indent value + self.stats.indent_min_odd = indent_k + # Max indent value + if indent_k1: + self.stats.indent_max_odd = indent_k1 + else: + self.stats.indent_max_odd = indent_k else: - if self.stats.indent_even != 0 \ - and self.stats.left_even > self.stats.indent_even: - l = self.stats.left_even - l1 = self.stats.left_even1 - self.stats.left_even = self.stats.indent_even - self.stats.left_even1 = self.stats.indent_even1 - self.stats.indent_even = l - self.stats.indent_even1 = l1 + # Min left value + self.stats.left_min_even = left_k + # Max left value + if left_k1: + self.stats.left_max_even = left_k1 + else: + self.stats.left_max_even = left_k + # Min indent value + self.stats.indent_min_even = indent_k + # Max indent value + if indent_k1: + self.stats.indent_max_even = indent_k1 + else: + self.stats.indent_max_even = indent_k # Find most popular top so that will be treated as top of page tcount = 0 @@ -1687,10 +1689,10 @@ class PDFDocument: # In this case, any value for indent is random. # Assume that at least 20% of lines would be indented # or that indent offset will be < 10% of line width - if self.stats.indent_odd - self.stats.left_odd > (self.stats.right - self.stats.left_odd) * 0.10: # 10% - self.stats.indent_odd = self.stats.indent_odd1 = self.stats.left_odd - # Assume for both if self.stats.indent_even - self.stats.left_even > (self.stats.right - self.stats.left_even) * 0.10: # 10% - self.stats.indent_even = self.stats.indent_even1 = self.stats.left_even + if self.stats.indent_min_odd - self.stats.left_min_odd > (self.stats.right - self.stats.left_min_odd) * 0.10: # 10% + self.stats.indent_min_odd = self.stats.indent_max_odd = self.stats.left_min_odd + # Assume for both if self.stats.indent_min_even - self.stats.left_min_even > (self.stats.right - self.stats.left_min_even) * 0.10: # 10% + self.stats.indent_min_even = self.stats.indent_max_even = self.stats.left_min_even # Sort spaces into ascending order then loop through. # Lowest value(s) are line spacing, next are para @@ -1944,28 +1946,30 @@ class PDFDocument: save_candidate = None while pind < len(self.pages): page = self.pages[pind] - stats_left = page.stats_left + stats_left_min = page.stats_left_min # Do not merge if the next paragraph is indented if page.texts: + if candidate: + last_line = candidate.texts[-1] if candidate \ + and last_line.bottom > orphan_space \ and page.texts[0].indented == 0: - last_line = candidate.texts[-1] merged_text = page.texts[0] top = merged_text.top # How much space in pixels was at the end of the last line? # If the book is justified text, any space could mean end-of-para # So, how to check for a justified book/page? - last_spare = candidate.right_margin - last_line.final_width # Pixels + last_spare = candidate.textwidth - last_line.final_width # Pixels # How big is the first word on the next line? merged_first = re.match(r'^([^ ]+)\s', merged_text.text_as_string) if merged_first is not None: # First word number of chars as pixels merged_len = len(merged_first.group(1)) * merged_text.average_character_width else: - merged_len = merged_text.right + merged_len = 0 # No merge # Allow where the last line ends with or next line starts with lower case. - if re.match(r'.*[a-z, -]$', last_line.text_as_string) is not None \ - or re.match(r'^[a-z, -]', merged_text.text_as_string) is not None : + if re.match(r'.*[a-z,-]\s*$', last_line.text_as_string) is not None \ + or re.match(r'^\s*[a-z,-]', merged_text.text_as_string) is not None : merged_len = merged_text.right # To use merged_len etc. @@ -1973,10 +1977,10 @@ class PDFDocument: if top <= min_top + page.average_text_height \ and merged_text.tag == 'p' \ and 'href=' not in merged_text.raw \ - and merged_text.left < stats_left + merged_text.average_character_width \ + and merged_text.left < stats_left_min + merged_text.average_character_width \ and not last_spare > merged_len \ - and not (re.match(r'.*[.!?](\u201d|”)$', last_line.text_as_string) is not None - and re.match(r'^(\u201c|“).*', merged_text.text_as_string) is not None): + and not ('"float:right"' in last_line.raw and '"float:right"' in merged_text.raw) \ + and not adjacent_quotes(last_line.text_as_string, merged_text.text_as_string): merge_done = True # We don't want to merge partial pages # i.e. if this is the last line, preserve its top/bottom till after merge @@ -1994,11 +1998,11 @@ class PDFDocument: if page.texts[0].top > self.stats.top + self.stats.line_space: page.texts[0].blank_line_after = 1 candidate = None - last_line = page.texts[-1] - bottom = last_line.bottom # Decide on whether merging is a good idea # Non-indented paragraphs are a problem # Do we have a short page? + last_line = page.texts[-1] + bottom = last_line.bottom if bottom < orphan_space \ and (len(page.imgs) == 0 or page.imgs[-1].bottom < orphan_space): # Force a new page. @@ -2011,7 +2015,6 @@ class PDFDocument: page.page_break_after = True elif (re.match(r'.*[a-z, ]$', last_line.text_as_string) is not None \ or last_line.final_width > page.width*self.opts.unwrap_factor): - # or (last_line.right * 100.0 / page.right_margin) > LAST_LINE_PERCENT): candidate = page else: candidate = None @@ -2020,7 +2023,7 @@ class PDFDocument: if merge_done: # We now need to skip to the next page number # The text has been appended to this page, so coalesce the paragraph - left_margin = merged_page.stats_left + left_margin = merged_page.stats_left_min right_margin = merged_page.stats_right candidate.texts[-1].coalesce(merged_text, candidate.number, left_margin, right_margin) merged_page.texts.remove(merged_text) @@ -2029,6 +2032,10 @@ class PDFDocument: # Ignore top as that can confuse things where the 1st para of a page # was merged with a previous. Keep the original top candidate.texts[-1].bottom = save_bottom + if merged_page.is_empty and save_bottom < orphan_space: + # This is a short page, so do not merge + candidate.page_break_after = True + candidate = None # Have we removed everything from this page (well, all texts and images) if merged_page.is_empty: