diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index bc34af6524..0e1f6fa9f7 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -14,6 +14,10 @@ from lxml import etree # Global constants affecting formatting decisions #### Pages/lines + +# How many pages to scan when finding header/footer automatically +PAGE_SCAN_COUNT = 20 # Arbitrary + # Fraction of a character width that two strings have to be apart, # for them to be considered part of the same text fragment # The problem is justified text where fragments can be widely spaced @@ -21,7 +25,9 @@ from lxml import etree # It also means no columns will be found COALESCE_FACTOR = 20.0 -# Allow some dither of bottom of characters when checking if same line +# Allow some dither of bottom of characters when checking if same line. +# The bottom of 1 line can overlap the top of the next by this amount +# and they are considered different lines. # Pixels from the PDF file BOTTOM_FACTOR = 2.0 @@ -29,18 +35,24 @@ BOTTOM_FACTOR = 2.0 # for them to be considered to be part of the same text fragment LINE_FACTOR = 0.2 -# Fraction of a line width that a line must exceed before -# it can merge with the next -# NOW IN OPTIONS -#LINE_SPLIT = 0.45 +# Long words can force a new line (at a new page) +# although the end of the previous is < this percent. +# Needs to find whether 1st word of 2nd page would fit on +# the last line of previous rather than the length of the last line. +LAST_LINE_PERCENT = 60.0 + +# Pages can split early to avoid orphans. +# Allow a margin when deciding whether a page finishes early, +# and a page break should be put in the HTML. +ORPHAN_LINES = 5 # Fraction of the gap between lines to determine if setting the paragraph break -# is likely to be valid. Somewhere between 1 and 2, probably about 1.3 -PARA_FACTOR = 1.3 +# is likely to be valid. Somewhere between 1 and 2, probably nearer 2 +PARA_FACTOR = 1.8 -# Multiplies the gap between lines to determine if this is a section break -# not a paragraph break Somewhere between 1 and 2, probably about 1.5 -SECTION_FACTOR = 1.4 +# Multiplies the gap between paragraphs to determine if this is a section break +# not a paragraph break +SECTION_FACTOR = 1.3 # Multiplies the average line height when determining row height # of a particular element to detect columns. @@ -55,12 +67,13 @@ LEFT_WAVER = 2.0 RIGHT_FACTOR = 1.8 # Percentage amount left and right margins can differ -# and still be considered centered. 0.1 = 10% -CENTER_FACTOR = 0.1 +# and still be considered centered. 0.15 = 15% +CENTER_FACTOR = 0.15 -#### Indents -# How near must values be to appear the same +#### Indents and line spacing +# How near must pixel values be to appear the same SAME_SPACE = 3.0 +SAME_INDENT = 2.0 class Font: @@ -87,8 +100,8 @@ class Element: class DocStats: def __init__(self): - self.top = self.bottom = self.left = self.left2 = self.right \ - = self.line_space = self.para_space = self.indent = self.indent2 = 0 + self.top = self.bottom = self.left_odd = self.left_even = self.right \ + = self.line_space = self.para_space = self.indent_odd = self.indent_even = 0 self.font_size = 0 class Image(Element): @@ -131,14 +144,17 @@ class Text(Element): self.right = self.left + self.width self.tag = 'p' # Normal paragraph
') ans.append('<%s'%text.tag) # Should be only for Headings, but there is no guarantee that the heading will be recognised @@ -1200,11 +1294,21 @@ class Page: ans[-1] += str(text.indented) #ans[-1] += '1' ans[-1] += 'em"' + # The margins need more work. e.g. can have indented + left + right + elif text.margin_left > 0: + ans[-1] += ' style="margin-left:' + ans[-1] += str(text.margin_left) + ans[-1] += 'em"' + elif text.margin_right > 0: + ans[-1] += ' style="margin-right:' + ans[-1] += str(text.margin_right) + ans[-1] += 'em"' ans[-1] += '>' - #ans.append(text.to_html()+' ') - #ans.append('%s>'%text.tag) ans[-1] += text.to_html() - ans[-1] += '%s>'%text.tag + ans[-1] += '%s>'%text.tag # Closing tag + if text.blank_line_after > 0: + ans.append('
') + # Any remaining images while iind < ilen: ans.append('
0: + c, k = find_indent(indents, indent_c) + if indent_c <= 0: + indent_c = c + if indent_k <= 0: + indent_k = k + elif abs(indent_k - k) <= SAME_INDENT: + indent_k = min(indent_k, k) + indent_k1 = max(indent_k1, k) + indent_c = min(indent_c, c) + else: + break + count -= 1 + + save_left = indent_k + if odd_even: + self.stats.left_odd = indent_k # Min left value + # Max left value + if indent_k1: + self.stats.left_odd1 = indent_k1 + else: + self.stats.left_odd1 = indent_k + else: + self.stats.left_even = indent_k # Min left value + # Max left value + if indent_k1: + self.stats.left_even1 = indent_k1 + else: + self.stats.left_even1 = indent_k + + # Find second most popular left so that will be treated as indent + indent_c -= 1 + total_c = 0 + indent_k = indent_k1 = 0 + count = len(indents) + while count > 0: + c, k = find_indent(indents, indent_c) + if indent_c <= 0: + indent_c = c + if indent_k <= 0: + indent_k = k + elif abs(indent_k - k) <= SAME_INDENT: + indent_k = min(indent_k, k) + indent_k1 = max(indent_k1, k) + indent_c = min(indent_c, c) + else: + break + total_c += c + count -= 1 + + # Find third most popular left as that might actually be the indent + # if between left and current and occurs a reasonable number of times. + save_k = indent_k + save_k1 = indent_k1 + save_count = total_c + indent_c -= 1 + total_c = 0 + indent_k = indent_k1 = 0 + count = len(indents) + while count > 0: + c, k = find_indent(indents, indent_c) + if indent_c <= 0: + indent_c = c + if indent_k <= 0: + indent_k = k + elif abs(indent_k - k) <= SAME_INDENT: + indent_k = min(indent_k, k) + indent_k1 = max(indent_k1, k) + indent_c = min(indent_c, c) + else: + break + total_c += c + count -= 1 + # Is this to be used? + if (save_k < indent_k \ + and save_k > save_left) \ + or total_c < save_count / 2: + # The usual case. The first ones found are to be used + indent_k = save_k + indent_k1 = save_k1 + + if odd_even: + self.stats.indent_odd = indent_k # Min indent value + # Max indent value + if indent_k1: + self.stats.indent_odd1 = indent_k1 + else: + self.stats.indent_odd1 = indent_k + else: + self.stats.indent_even = indent_k # Min indent value + # Max indent value + if indent_k1: + self.stats.indent_even1 = indent_k1 + else: + self.stats.indent_even1 = indent_k + + # For safety, check left and indent are in the right order + if odd_even: + if self.stats.indent_odd != 0 \ + and self.stats.left_odd > self.stats.indent_odd: + l = self.stats.left_odd + l1 = self.stats.left_odd1 + self.stats.left_odd = self.stats.indent_odd + self.stats.left_odd1 = self.stats.indent_odd1 + self.stats.indent_odd = l + self.stats.indent_odd1 = l1 + else: + if self.stats.indent_even != 0 \ + and self.stats.left_even > self.stats.indent_even: + l = self.stats.left_even + l1 = self.stats.left_even1 + self.stats.left_even = self.stats.indent_even + self.stats.left_even1 = self.stats.indent_even1 + self.stats.indent_even = l + self.stats.indent_even1 = l1 + # Find most popular top so that will be treated as top of page tcount = 0 for t in self.tops: @@ -1351,58 +1588,35 @@ class PDFDocument: self.stats.top = t # Some PDFs have alternating pages with different lefts/indents. - # So, if the 2nd highest is 90% of the highest, assume it is a left. - # Same for the indents, assuming there were 2 lefts + # Always separate odd and even, though they are usually the same. + # Find most left/indent for odd pages + set_indents(self.indents_odd, 1) + # Find most left/indent for even pages + set_indents(self.indents_even, 0) - # Find most popular left so that will be treated as left of page - icount = 0 - for i in self.indents: - if icount < self.indents[i]: - icount = self.indents[i] - self.stats.left = i + # Find farthest right so that will be treated as page right + ## SHOULD DO RIGHT2 as well + rcount = 0 + for r in self.rights: + if rcount < r: + rcount = r + self.stats.right = r - # Find second most popular left so that will be treated as indent - icount = 0 - for i in self.indents: - if i != self.stats.left and icount < self.indents[i]: - icount = self.indents[i] - self.stats.indent = i + # Do something about left and right margin values + # They need the same sort of treatment as indents + # self.stats.margin_left = 0 + # self.stats.margin_right = 0 - # For safety, check left and indent are in the right order - if self.stats.indent != 0 \ - and self.stats.left > self.stats.indent: - l = self.stats.left - self.stats.left = self.stats.indent - self.stats.indent = l + # Some PDFs have no indentation of paragraphs. + # In this case, any value for indent is random. + # Assume that at least 20% of lines would be indented + # or that indent offset will be < 10% of line width + if self.stats.indent_odd - self.stats.left_odd > (self.stats.right - self.stats.left_odd) * 0.10: # 10% + self.stats.indent_odd = self.stats.indent_odd1 = self.stats.left_odd + # Assume for both if self.stats.indent_even - self.stats.left_even > (self.stats.right - self.stats.left_even) * 0.10: # 10% + self.stats.indent_even = self.stats.indent_even1 = self.stats.left_even - # Now decide whether there are 2 similar, i.e. within 95% (arbitrary) - if self.stats.indent > 0 \ - and 100.0 * self.indents[self.stats.indent] / self.indents[self.stats.left] > 95.0: - self.stats.left2 = self.stats.indent - - # Find next most popular left so that will be treated as indent - icount = 0 - for i in self.indents: - if i != self.stats.left and i != self.stats.left2 and icount < self.indents[i]: - icount = self.indents[i] - self.stats.indent = i - - # And the last most popular left so that will be treated as indent2 - # Should check it is within 90%. What to do if not? - icount = 0 - for i in self.indents: - if i != self.stats.left and i != self.stats.left2 \ - and i != self.stats.indent and icount < self.indents[i]: - icount = self.indents[i] - self.stats.indent2 = i - - # And check indent and indent2 are in the right order - if self.stats.indent > self.stats.indent2: - l = self.stats.indent - self.stats.indent = self.stats.indent2 - self.stats.indent2 = l - - # Sort spaces into ascending order then loop through + # Sort spaces into ascending order then loop through. # Lowest value(s) are line spacing, next are para # Spaces not yet set up self.stats.line_space = self.stats.para_space = -1.0 @@ -1453,28 +1667,26 @@ class PDFDocument: else: self.stats.para_space = para_k - # Find most popular bottom so that will be treated as bottom of page - # Or the max bottom? Or the max used value within 10% of max value? + # Find the max bottom so that will be treated as bottom of page + # Or most popular bottom? Or the max used value within 10% of max value? bcount = 0 for b in self.bottoms: if bcount < self.bottoms[b]: #and b > self.stats.bottom*0.9: bcount = self.bottoms[b] - #if b > self.stats.bottom: + if b > self.stats.bottom: self.stats.bottom = b - # Find farthest right so that will be treated as page right - ## SHOULD DO RIGHT2 as well - rcount = 0 - for r in self.rights: - if rcount < r: - rcount = r - self.stats.right = r - def find_header_footer(self): # If requested, scan first few pages for possible headers/footers - scan_count = 20 # Arbitrary + if (self.opts.pdf_header_skip >= 0 \ + and self.opts.pdf_footer_skip >= 0) \ + or len(self.pages) < 2: + # Doc is empty or 1 page. Can't decide on any skips + return + + scan_count = PAGE_SCAN_COUNT head_text = '' head_match = 0 head_match1 = 0 @@ -1532,11 +1744,6 @@ class PDFDocument: if pages_to_scan < 1: break - # How many pages have been scanned? - if pages_to_scan > scan_count-2: - # Doc is empty or 1 page. Can't decide on any skips - return - if pages_to_scan > 0: # Doc is shorter than scan_count pages_to_scan = scan_count - pages_to_scan # Number scanned @@ -1562,67 +1769,67 @@ class PDFDocument: # Remove any header/footer lines from all pages for page in self.pages: # If a text is removed, we need to restart the loop or what was the next will be skipped - removed = 1 - while removed == 1: - removed = 0 + removed = True + while removed: + removed = False for t in page.texts: if self.opts.pdf_header_skip > 0 and t.top < self.opts.pdf_header_skip \ or self.opts.pdf_footer_skip > 0 and t.top > self.opts.pdf_footer_skip: page.texts.remove(t) - removed = 1 + removed = True + break # Restart loop - def merge_pages(self, left_margin): + def merge_pages(self, idc): # Check for pages that can be merged # When merging pages, assume short last lines mean no merge # BUT unfortunately there is no way to tell the difference # between a continuation of a paragraph and a 'section break' # if the previous page ends a sentence. - # Also long words can force a new line (at a new page) - # although the end of the previous is < this percent. - # Needs to find whether 1st word of 2nd page would fit on - # the last line of previous rather than the length of the last line. - # Pages can split early to avoid orphans. - ORPHAN_LINES = 2 # First, find the minimum text top and the maximum text bottom min_top = self.stats.top max_bottom = self.stats.bottom + # The space at the end of a page that indicates there is no merge + orphan_space = max_bottom - ORPHAN_LINES*self.stats.line_space # Keep a note of the position of the final line on the merged page save_bottom = 0 # After merge, skip to this page - save_number = 0 + pind = 0 # Now merge where bottom of one is within ORPHAN_LINES lines of max_bottom # and top of next is within a line of min_top # and margins correspond, and it's a normal paragraph merge_done = True while merge_done: - merge_done = False - merged_page = None - candidate = None - for page in self.pages: - if page.number < save_number: - next + merge_done = False # A merge was done + merged_page = None # Page merged into previous + candidate = None # Lines close enough to the bottom that it might merge + while pind < len(self.pages): + page = self.pages[pind] + if page.odd_even: + stats_left = page.stats_left_odd + else: + stats_left = page.stats_left_even + # Do not merge if the next paragraph is indented if page.texts: - if candidate: + if candidate \ + and page.texts[0].indented == 0: last_line = candidate.texts[-1] merged_text = page.texts[0] top = merged_text.top - # Should we check that the new line starts lower case? Doesn't cover all cases. - #and re.match('^[a-z]', merged_text.text_as_string) is not None - # How much space in characters was at the end of the last line? - # If the book is justified text, any space should mean end-of-para + # How much space in pixels was at the end of the last line? + # If the book is justified text, any space could mean end-of-para # So, how to check for a justified book/page? - last_spare = (candidate.right_margin - last_line.right) / last_line.average_character_width + last_spare = candidate.right_margin - last_line.final_width # Pixels # How big is the first word on the next line? - merged_first = re.match(r'.+?(\s)+?', merged_text.text_as_string) + merged_first = re.match(r'^([^ ]+)\s', merged_text.text_as_string) if merged_first is not None: - # First word length as float - merged_len = len(merged_first.group(0)) * 1.0 + # First word number of chars as pixels + merged_len = len(merged_first.group(1)) * merged_text.average_character_width else: merged_len = merged_text.right # Allow where the last line ends with or next line starts with lower case. - if re.match('[a-z, -]$', last_line.text_as_string) is not None \ + if re.match('.*[a-z, -]$', last_line.text_as_string) is not None \ or re.match('^[a-z, -]', merged_text.text_as_string) is not None : merged_len = merged_text.right @@ -1631,9 +1838,9 @@ class PDFDocument: if top <= min_top + page.average_text_height \ and merged_text.tag == 'p' \ and 'href=' not in merged_text.raw \ - and merged_text.left < page.stats_left + merged_text.average_character_width \ + and merged_text.left < stats_left + merged_text.average_character_width \ and not last_spare > merged_len \ - and not (re.match('.*(\u201d|”)$', last_line.text_as_string) is not None + and not (re.match('.*[.!?](\u201d|”)$', last_line.text_as_string) is not None and re.match('^(\u201c|“).*', merged_text.text_as_string) is not None): merge_done = True # We don't want to merge partial pages @@ -1642,46 +1849,60 @@ class PDFDocument: save_bottom = merged_text.bottom else: save_bottom = 0.0 - + # Update this page final top/bottom merged_text.top = candidate.texts[-1].top + page.average_text_height - merged_text.bottom = merged_text.top + page.average_text_height - candidate.texts.append(merged_text) + merged_text.bottom = merged_text.top + merged_text.height merged_page = page break + # If the next page starts below the top, add a blank line before the first line + # This must not be done after a merge as the top has moved + if page.texts[0].top > self.stats.top + self.stats.line_space: + page.texts[0].blank_line_after = 1 candidate = None last_line = page.texts[-1] bottom = last_line.bottom # Decide on whether merging is a good idea # Non-indented paragraphs are a problem - if bottom >= max_bottom \ - - (ORPHAN_LINES*page.average_text_height) \ - - (ORPHAN_LINES*self.stats.line_space) \ - and (re.match('[a-z, ]$', last_line.text_as_string) is not None \ - or last_line.final_width > page.width*self.opts.unwrap_factor): - # or (last_line.right * 100.0 / page.right_margin > LAST_LINE_PERCENT)) : + # Do we have a short page? + if bottom < orphan_space \ + and (len(page.imgs) == 0 or page.imgs[-1].bottom < orphan_space): + # Force a new page. + # Avoid this if the next page starts with an image that wouldn't fit + if pind < len(self.pages)-1: # There is another page + if len(self.pages[pind+1].imgs) == 0 \ + or (self.pages[pind+1].imgs[0].height < orphan_space \ + and (len(self.pages[pind+1].texts) == 0 \ + or self.pages[pind+1].texts[0].top > self.pages[pind+1].imgs[0].top)): + page.page_break_after = True + elif (re.match('.*[a-z, ]$', last_line.text_as_string) is not None \ + or last_line.final_width > page.width*self.opts.unwrap_factor): + # or (last_line.right * 100.0 / page.right_margin) > LAST_LINE_PERCENT): candidate = page else: candidate = None + pind += 1 + if merge_done: - merged_page.texts.remove(merged_text) # We now need to skip to the next page number - # The current page can no longer have anything to merge - save_number = merged_page.number + 1 - # Re-calling coalesce_paras doesn't seem to work - candidate.texts[-2].coalesce(candidate.texts[-1], candidate.number, left_margin) - candidate.texts.remove(candidate.texts[-1]) + # The text has been appended to this page, so coalesce the paragraph + if merged_page.odd_even: + left_margin = merged_page.stats_left_odd + else: + left_margin = merged_page.stats_left_even + candidate.texts[-1].coalesce(merged_text, candidate.number, left_margin) + merged_page.texts.remove(merged_text) # Put back top/bottom after coalesce if final line if save_bottom != 0.0 : # Ignore top as that can confuse things where the 1st para of a page # was merged with a previous. Keep the original top - #candidate.texts[-1].top = save_top candidate.texts[-1].bottom = save_bottom #candidate.coalesce_paras() - # Have we removed everything from this page (well, all texts) - if len(merged_page.texts) == 0: + # Have we removed everything from this page (well, all texts and images) + if len(merged_page.texts) == 0 \ + and len(merged_page.imgs) == 0: + candidate.texts[-1].blank_line_before = 1 self.pages.remove(merged_page) - def linearize(self): self.elements = [] last_region = last_block = None @@ -1721,6 +1942,8 @@ class PDFDocument: '', '
'] for page in self.pages: html.extend(page.to_html()) + if page.page_break_after: + html+= [''] html += ['', '