diff --git a/src/calibre/ebooks/conversion/config.py b/src/calibre/ebooks/conversion/config.py index 91f46b984f..b7591d8f03 100644 --- a/src/calibre/ebooks/conversion/config.py +++ b/src/calibre/ebooks/conversion/config.py @@ -221,7 +221,7 @@ OPTIONS = { 'fb2': ('no_inline_fb2_toc',), - 'pdf': ('no_images', 'unwrap_factor'), + 'pdf': ('no_images', 'unwrap_factor', 'new_pdf_engine', 'pdf_header_skip', 'pdf_footer_skip', 'pdf_header_regex', 'pdf_footer_regex'), 'rtf': ('ignore_wmf',), diff --git a/src/calibre/ebooks/conversion/plugins/pdf_input.py b/src/calibre/ebooks/conversion/plugins/pdf_input.py index 42840b10db..d9b3dc494c 100644 --- a/src/calibre/ebooks/conversion/plugins/pdf_input.py +++ b/src/calibre/ebooks/conversion/plugins/pdf_input.py @@ -11,7 +11,7 @@ from polyglot.builtins import as_bytes class PDFInput(InputFormatPlugin): name = 'PDF Input' - author = 'Kovid Goyal and John Schember' + author = 'Kovid Goyal and John Schember and Alan Pettigrew' description = _('Convert PDF files to HTML') file_types = {'pdf'} commit_name = 'pdf_input' @@ -24,20 +24,27 @@ class PDFInput(InputFormatPlugin): 'be unwrapped. Valid values are a decimal between 0 and 1. The ' 'default is 0.45, just below the median line length.')), OptionRecommendation(name='new_pdf_engine', recommended_value=False, - help=_('Use the new PDF conversion engine. Currently not operational.')) + help=_('Use the new, experimental, PDF conversion engine.')), + OptionRecommendation(name='pdf_header_skip', recommended_value=-1, + help=_('Skip everything to the specified number pixels at the top of a page.' + ' Negative numbers mean auto-detect and remove headers, zero means do not remove headers and positive numbers' + ' mean remove headers that appear above that many pixels from the top of the page. Works only' + ' with the new PDF engine.' + )), + OptionRecommendation(name='pdf_footer_skip', recommended_value=-1, + help=_('Skip everything to the specified number of pixels at the bottom of a page.' + ' Negative numbers mean auto-detect and remove footers, zero means do not remove footers and positive numbers' + ' mean remove footers that appear below that many pixels from the bottom of the page. Works only' + ' with the new PDF engine.' + )), + OptionRecommendation(name='pdf_header_regex', recommended_value='', + help=_('Regular expression to remove lines at the top of a page. ' + 'This only looks at the first line of a page and works only with the new PDF engine.')), + OptionRecommendation(name='pdf_footer_regex', recommended_value='', + help=_('Regular expression to remove lines at the bottom of a page. ' + 'This only looks at the last line of a page and works only with the new PDF engine.')), } - def convert_new(self, stream, accelerators): - from calibre.ebooks.pdf.pdftohtml import pdftohtml - from calibre.ebooks.pdf.reflow import PDFDocument - from calibre.utils.cleantext import clean_ascii_chars - - pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True) - with open('index.xml', 'rb') as f: - xml = clean_ascii_chars(f.read()) - PDFDocument(xml, self.opts, self.log) - return os.path.join(os.getcwd(), 'metadata.opf') - def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.metadata.opf2 import OPFCreator @@ -47,8 +54,14 @@ class PDFInput(InputFormatPlugin): # The main html file will be named index.html self.opts, self.log = options, log if options.new_pdf_engine: - return self.convert_new(stream, accelerators) - pdftohtml(os.getcwd(), stream.name, options.no_images) + from calibre.ebooks.pdf.reflow import PDFDocument + from calibre.utils.cleantext import clean_ascii_chars + pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True) + with open(u'index.xml', 'rb') as f: + xml = clean_ascii_chars(f.read()) + PDFDocument(xml, self.opts, self.log) + else: + pdftohtml(os.getcwd(), stream.name, options.no_images) from calibre.ebooks.metadata.meta import get_metadata log.debug('Retrieving document metadata...') diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index f66dd4dc00..bc34af6524 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -1,18 +1,66 @@ #!/usr/bin/env python - __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import numbers import os +import re import sys -from itertools import count +from operator import attrgetter from lxml import etree -from calibre.utils.xml_parse import safe_xml_fromstring +# Global constants affecting formatting decisions + +#### Pages/lines +# Fraction of a character width that two strings have to be apart, +# for them to be considered part of the same text fragment +# The problem is justified text where fragments can be widely spaced +# Was 0.5 but this forces just about anything to coalesce. +# It also means no columns will be found +COALESCE_FACTOR = 20.0 + +# Allow some dither of bottom of characters when checking if same line +# Pixels from the PDF file +BOTTOM_FACTOR = 2.0 + +# Fraction of text height that two strings' bottoms can differ by +# for them to be considered to be part of the same text fragment +LINE_FACTOR = 0.2 + +# Fraction of a line width that a line must exceed before +# it can merge with the next +# NOW IN OPTIONS +#LINE_SPLIT = 0.45 + +# Fraction of the gap between lines to determine if setting the paragraph break +# is likely to be valid. Somewhere between 1 and 2, probably about 1.3 +PARA_FACTOR = 1.3 + +# Multiplies the gap between lines to determine if this is a section break +# not a paragraph break Somewhere between 1 and 2, probably about 1.5 +SECTION_FACTOR = 1.4 + +# Multiplies the average line height when determining row height +# of a particular element to detect columns. +YFUZZ = 1.5 + +# Left (and other) margins can waver. +# Plus or minus this +LEFT_WAVER = 2.0 + +# Amount left margin must be greater than right for text +# to be considered right aligned. 1.8 = 180% +RIGHT_FACTOR = 1.8 + +# Percentage amount left and right margins can differ +# and still be considered centered. 0.1 = 10% +CENTER_FACTOR = 0.1 + +#### Indents +# How near must values be to appear the same +SAME_SPACE = 3.0 class Font: @@ -20,10 +68,10 @@ class Font: def __init__(self, spec): self.id = spec.get('id') self.size = float(spec.get('size')) + self.size_em = 0.0 self.color = spec.get('color') self.family = spec.get('family') - class Element: def __init__(self): @@ -36,6 +84,12 @@ class Element: def __hash__(self): return hash(self.id) +class DocStats: + + def __init__(self): + self.top = self.bottom = self.left = self.left2 = self.right \ + = self.line_space = self.para_space = self.indent = self.indent2 = 0 + self.font_size = 0 class Image(Element): @@ -43,15 +97,16 @@ class Image(Element): Element.__init__(self) self.opts, self.log = opts, log self.id = next(idc) - self.top, self.left, self.width, self.height, self.iwidth, self.iheight = \ - map(float, map(img.get, ('top', 'left', 'rwidth', 'rheight', 'iwidth', - 'iheight'))) + self.top, self.left, self.width, self.height = \ + map(float, map(img.get, ('top', 'left', 'width', 'height'))) self.src = img.get('src') self.bottom = self.top + self.height self.right = self.left + self.width + # Check for alignment done later + self.align = 'L' def to_html(self): - return '' % \ + return '' % \ (self.src, int(self.width), int(self.height)) def dump(self, f): @@ -66,14 +121,181 @@ class Text(Element): self.id = next(idc) self.opts, self.log = opts, log self.font_map = font_map + self.top, self.left, self.width, self.height = map(round, map(float, map(text.get, + ('top', 'left', 'width', 'height')))) + # This does nothing, as expected, + # but somewhere left (at least) is changed sometimes to not .0 + if self.left != round(self.left) : + self.left = round(self.left) + self.bottom = self.top + self.height + self.right = self.left + self.width + self.tag = 'p' # Normal paragraph + self.indented = 0 + # When joining lines for a paragraph, remember position of last line joined + self.last_left = self.left + # Remember the length of this line if it is merged into a paragraph + self.final_width = self.width + # Align = Left, Right, Center, Justified. Default L + self.align = 'L' + # Should there be extra space between paragraphs? + self.blank_line = 0 + + if self.font_map: + self.font = self.font_map[text.get('font')] + self.font_size = self.font.size + self.font_size_em = self.font.size_em + self.color = self.font.color + self.font_family = self.font.family + else: + self.font = {} + self.font_size = 0.0 + self.font_size_em = 0.0 + # self.color = 0 + + text.tail = '' + self.text_as_string = etree.tostring(text, method='text', encoding='unicode') + self.raw = text.text if text.text else '' + for x in text.iterchildren(): + self.raw += etree.tostring(x, method='xml', encoding='unicode') + self.average_character_width = self.width/len(self.text_as_string) + + def coalesce(self, other, page_number, left_margin): + if self.opts.verbose > 2: + self.log.debug('Coalescing %r with %r on page %d'%(self.text_as_string, + other.text_as_string, page_number)) + # For elements of the same line, is there a space between? + # Spaces are narrow, so a_c_w/3 + if (self.top <= other.top and self.bottom >= other.bottom) \ + and abs(other.left - self.right) < self.average_character_width / 3.0: + has_gap = 0 + else: # Insert n spaces to fill gap. Use TAB? + if other.left < self.right: + has_gap = 1 # Coalescing different lines + else: # Multiple texts on same line + has_gap = round(abs(other.left - self.right) / self.average_character_width) + + self.top = min(self.top, other.top) + self.left = min(self.left, other.left) + self.right = max(self.right, other.right) + self.width += other.width + self.final_width += other.final_width + self.bottom = max(self.bottom, other.bottom) + self.height = self.bottom - self.top + # Need to check for $', self.raw) + m_other = re.match('^(.+)$', other.raw) + if m_self and m_other: + self.raw = m_self.group(1) + other.raw = m_other.group(1) + elif self.font_size_em != other.font_size_em \ + and self.font_size_em != 1.00 : + if re.match('= other.font_size_em * 2.0 : + # Insert 'float: left' etc. into current font info + # Unfortunately, processing to generate the .epub file changes things. + # The line height gets set to the same as other parts of the file + # and the font size is reduced. + # These need to be fixed manually. + m_self = re.match('^(.+em">)(.+)$', self.raw) + self.raw = m_self.group(1) \ + + '' \ + + m_self.group(2) + '' + + self.font_size = max(self.font_size, other.font_size) + self.font_size_em = max(self.font_size_em, other.font_size_em) + self.font = other.font if self.font_size == other.font_size else other.font + if has_gap > 0: + if has_gap < 3: # Small number of spaces = 1 space + if not (self.text_as_string.endswith(' ') \ + or self.text_as_string.endswith('-') \ + or other.text_as_string.startswith(' ') \ + or other.text_as_string.startswith('-') ): + has_gap = 1 + else: + has_gap = 0 + # Insert multiple spaces + while has_gap > 0: + self.text_as_string += ' ' + self.raw += ' ' + self.width += self.average_character_width + #self.final_width += self.average_character_width + has_gap -= 1 + + self.text_as_string += other.text_as_string + #self.width += other.width + + # Try to merge href where there are 2 for the same place + # Beware multiple hrefs on the same line, but for different places + # e.g. + # self.raw = 'T' + # other.raw = 'ITLE' + # becomes 'TITLE' + # Are there problems if self.raw does not end ? + # Note that the 2 parts could have different font sizes + matchObj = re.match(r'^([^<]*)(]*>)*(]+>)(.*)()*(\s*)$', self.raw) + if matchObj is not None : + otherObj = re.match('^([^<]*)(]*>)*(]+>)(.*)()()*(.*)$', other.raw) + # There is another href, but is it for the same place? + if otherObj is not None and matchObj.group(3) == otherObj.group(3) : + m2 = matchObj.group(2) + if m2 is None: + m2 = '' + m5 = matchObj.group(5) + if m5 is None: + m5 = '' + o2 = otherObj.group(2) + if o2 is None: + o2 = '' + o6 = otherObj.group(6) + if o6 is None: + o6 = '' + # Remove the other stuff and put the last + other.raw = otherObj.group(1)+o2+otherObj.group(4)+o6+otherObj.group(5)+otherObj.group(7) + # Move the + self.raw = matchObj.group(1)+matchObj.group(3)+m2+matchObj.group(4)+m5+matchObj.group(6) + # + self.raw += other.raw + self.average_character_width = self.width/len(self.text_as_string) + #self.last_left = other.left + + def to_html(self): + #matchObj = re.match(u'^(.*)(]+>)(.*)(.*)$', self.raw) + #if + return self.raw + + def dump(self, f): + f.write('T top={}, left={}, width={}, height={}: '.format(self.top, self.left, self.width, self.height)) + f.write(self.to_html().encode('utf-8')) + f.write('\n') + +class Paragraph(Text): + + def __init__(self, text, font_map, opts, log, idc): + Text.__init__(self) + self.id = next(idc) + self.opts, self.log = opts, log + self.font_map = font_map self.top, self.left, self.width, self.height = map(float, map(text.get, ('top', 'left', 'width', 'height'))) self.bottom = self.top + self.height self.right = self.left + self.width - self.font = self.font_map[text.get('font')] - self.font_size = self.font.size - self.color = self.font.color - self.font_family = self.font.family + if self.font_map: + self.font = self.font_map[text.get('font')] + self.font_size = self.font.size + self.color = self.font.color + self.font_family = self.font.family + else: + self.font = {} + self.font_size = 0 + # self.color = 0 text.tail = '' self.text_as_string = etree.tostring(text, method='text', @@ -83,30 +305,14 @@ class Text(Element): self.raw += etree.tostring(x, method='xml', encoding='unicode') self.average_character_width = self.width/len(self.text_as_string) - def coalesce(self, other, page_number): - if self.opts.verbose > 2: - self.log.debug('Coalescing %r with %r on page %d'%(self.text_as_string, - other.text_as_string, page_number)) - self.top = min(self.top, other.top) - self.right = other.right - self.width = self.right - self.left - self.bottom = max(self.bottom, other.bottom) - self.height = self.bottom - self.top - self.font_size = max(self.font_size, other.font_size) - self.font = other.font if self.font_size == other.font_size else other.font - self.text_as_string += other.text_as_string - self.raw += other.raw - self.average_character_width = (self.average_character_width + - other.average_character_width)/2.0 - def to_html(self): return self.raw def dump(self, f): + f.write('P top={}, left={}, width={}, height={}: '.format(self.top, self.left, self.width, self.height)) f.write(self.to_html().encode('utf-8')) f.write('\n') - class FontSizeStats(dict): def __init__(self, stats): @@ -118,7 +324,6 @@ class FontSizeStats(dict): self.most_common_size, self.chars_at_most_common_size = sz, chars self[sz] = chars/total - class Interval: def __init__(self, left, right): @@ -144,13 +349,13 @@ class Interval: def __hash__(self): return hash('(%f,%f)'%self.left, self.right) - class Column: - # A column contains an element is the element bulges out to + # A column contains an element if the element bulges out to # the left or the right by at most HFUZZ*col width. HFUZZ = 0.2 + def __init__(self): self.left = self.right = self.top = self.bottom = 0 self.width = self.height = 0 @@ -170,10 +375,10 @@ class Column: self._post_add() def _post_add(self): - self.elements.sort(key=lambda x: x.bottom) + self.elements.sort(key=attrgetter('bottom')) self.top = self.elements[0].top self.bottom = self.elements[-1].bottom - self.left, self.right = sys.maxsize, 0 + self.left, self.right = sys.maxint, 0 for x in self: self.left = min(self.left, x.left) self.right = max(self.right, x.right) @@ -198,7 +403,7 @@ class Column: left_margin = elem.left - self.left elem.indent_fraction = left_margin/self.width elem.width_fraction = elem.width/self.width - if i == 0: + if i == 0 or self.average_line_separation == 0: elem.top_gap_ratio = None else: elem.top_gap_ratio = (self.elements[i-1].bottom - @@ -223,14 +428,13 @@ class Box(list): def to_html(self): ans = ['<%s>'%self.tag] for elem in self: - if isinstance(elem, numbers.Integral): + if isinstance(elem, int): ans.append(''%elem) else: ans.append(elem.to_html()+' ') ans.append(''%self.tag) return ans - class ImageBox(Box): def __init__(self, img): @@ -243,7 +447,7 @@ class ImageBox(Box): if len(self) > 0: ans.append('
') for elem in self: - if isinstance(elem, numbers.Integral): + if isinstance(elem, int): ans.append('
'%elem) else: ans.append(elem.to_html()+' ') @@ -260,7 +464,7 @@ class Region: def add(self, columns): if not self.columns: - for x in sorted(columns, key=lambda x: x.left): + for x in sorted(columns, key=attrgetter('left')): self.columns.append(x) else: for i in range(len(columns)): @@ -323,11 +527,12 @@ class Region: idx) col.add(elem) + def collect_stats(self): for column in self.columns: column.collect_stats() - self.average_line_separation = sum(x.average_line_separation for x in - self.columns)/float(len(self.columns)) + self.average_line_separation = sum([x.average_line_separation for x in + self.columns])/float(len(self.columns)) def __iter__(self): yield from self.columns @@ -407,39 +612,104 @@ class Region: self.boxes[-1].append(elem) + class Page: - # Fraction of a character width that two strings have to be apart, - # for them to be considered part of the same text fragment - COALESCE_FACTOR = 0.5 - - # Fraction of text height that two strings' bottoms can differ by - # for them to be considered to be part of the same text fragment - LINE_FACTOR = 0.4 - - # Multiplies the average line height when determining row height - # of a particular element to detect columns. - YFUZZ = 1.5 - def __init__(self, page, font_map, opts, log, idc): + def text_cmp(frst, secnd): + # Compare 2 text objects. + # Order by line (top/bottom) then left + if (frst.top <= secnd.top and frst.bottom >= secnd.bottom) \ + or (secnd.top <= frst.top and secnd.bottom >= frst.bottom) : + # Overlap = same line + if frst.left < secnd.left : + return -1 + elif frst.left == secnd.left : + return 0 + return 1 + # Different line so sort into line number + if frst.bottom < secnd.bottom : + return -1 + elif frst.bottom == secnd.bottom : + return 0 + return 1 + + # The sort comparison caller + from functools import cmp_to_key + self.opts, self.log = opts, log self.font_map = font_map self.number = int(page.get('number')) - self.width, self.height = map(float, map(page.get, - ('width', 'height'))) + self.width, self.height = map(float, map(page.get, ('width', 'height'))) self.id = 'page%d'%self.number self.texts = [] - self.left_margin, self.right_margin = self.width, 0 + self.imgs = [] + # Set margins to values that will get adjusted + self.left_margin = self.width + self.right_margin = 0 + # For whether page number has been put in + self.id_used = 0 for text in page.xpath('descendant::text'): self.texts.append(Text(text, self.font_map, self.opts, self.log, idc)) text = self.texts[-1] - self.left_margin = min(text.left, self.left_margin) - self.right_margin = max(text.right, self.right_margin) + # Allow for ' ...' + matchObj = re.match(r'^(\s*)(<[^>]+>)?(\s*)(.*)$', text.raw) + s1 = matchObj.group(1) + s2 = matchObj.group(3) + t1 = matchObj.group(2) or '' + t2 = matchObj.group(4) or '' + tx = t1 + t2 + # Remove lines of only spaces. Could be etc., but process later + # Need to keep any href= (and others?) + if len(tx) == 0: + #and re.match(r'href=', text.raw) is None: + self.texts.remove(text) + elif (self.opts.pdf_header_skip <= 0 or text.top >= self.opts.pdf_header_skip) \ + and (self.opts.pdf_footer_skip <= 0 or text.top <= self.opts.pdf_footer_skip): + # Remove leading spaces and make into indent + # Assume 1 space = 1 av_char_width? + s = len(s1) + len(s2) + if s > 1: # Allow single leading space + w = round(s * text.average_character_width/2.0) # Spaces < avg width + text.raw = tx + text.text_as_string = text.text_as_string[s:] + text.left += w # Add indent + text.last_left += w + text.width -= w # Reduce width + text.final_width -= w + #text.indented gets set later + self.left_margin = min(text.left, self.left_margin) + self.right_margin = max(text.right, self.right_margin) + # Change #nnn to page_nnn in hrefs + matchObj = re.match(r'^(.*)()$', self.imgs[-1]) + #if matchObj is not None: + # self.imgs[-1] = matchObj.group(1)+u'" alt=""/>' self.textwidth = self.right_margin - self.left_margin + # Sort into page order. bottom then left + # NB. This is only approximate as different sized characters + # can mean sections of a line vary in top or bottom. + # bottom is less varied than top, but is not guaranteed. + # Multi-line characters make things even more interesting. + self.texts.sort(key=cmp_to_key(text_cmp)) + self.font_size_stats = {} self.average_text_height = 0 for t in self.texts: @@ -452,23 +722,26 @@ class Page: self.font_size_stats = FontSizeStats(self.font_size_stats) - self.coalesce_fragments() - - self.elements = list(self.texts) - for img in page.xpath('descendant::img'): - self.elements.append(Image(img, self.opts, self.log, idc)) - self.elements.sort(key=lambda x: x.top) - - def coalesce_fragments(self): + def join_fragments(self, opts): + # Join fragments on a line + # Do some basic checks on structure def find_match(frag): for t in self.texts: - hdelta = t.left - frag.right - hoverlap = self.COALESCE_FACTOR * frag.average_character_width - if t is not frag and hdelta > -hoverlap and \ - hdelta < hoverlap and \ - abs(t.bottom - frag.bottom) < self.LINE_FACTOR*frag.height: - return t + # and abs(t.bottom - frag.bottom) <= BOTTOM_FACTOR : + if t is not frag : + if (t.top <= frag.top and t.bottom >= frag.bottom) \ + or (t.top >= frag.top and t.bottom <= frag.bottom): + return t # Force match if same line + # Sorting can put parts of a line in the wrong order if there are small chars + if t.left < frag.left: + hdelta = frag.left - t.right + else: + hdelta = t.left - frag.right + hoverlap = COALESCE_FACTOR * frag.average_character_width + if hdelta > -hoverlap and \ + hdelta < hoverlap : + return t match_found = True while match_found: @@ -477,12 +750,253 @@ class Page: match = find_match(frag) if match is not None: match_found = True - frag.coalesce(match, self.number) + # Because texts are sorted top, left we can get small chars on the same line + # appearing after larger ones, even though they are further left + if frag.left > match.left: + x = frag + frag = match + match = x + frag.coalesce(match, self.number, self.left_margin) break if match is not None: self.texts.remove(match) - def first_pass(self): + def check_centered(self, stats): + # Check for centered text + # Also check for right aligned, and basic chapter structure + + # If there are different left/indents, need to adjust for this page + # Is the text offset from the left? + # The centering check would fail where all lines on a page are centered + # so use stats_left and stats_indent + first = True + # Assume not Contents + self.contents = False + m = len(self.texts) + #for t in self.texts: + for i in range(m): + t = self.texts[i] + lmargin = t.left + rmargin = self.width - t.right + # Do we have a sequence of indented lines? + xmargin = ymargin = -1 + if i > 0: + xmargin = self.texts[i-1].left + if i < m-1: + ymargin = self.texts[i+1].left + + # Don't want to set headings on a Contents page + # NB Doesn't work where Contents goes to another page + if re.match(r'(?i)^\s*(table of )?contents\s*$', t.text_as_string) is not None: + self.contents = True + t.tag = 'h2' # It won't get set later + # Centered if left and right margins are within FACTOR% + if lmargin > self.stats_left \ + and lmargin != self.stats_indent \ + and lmargin != xmargin \ + and lmargin != ymargin \ + and lmargin >= rmargin - rmargin*CENTER_FACTOR \ + and lmargin <= rmargin + rmargin*CENTER_FACTOR: + #and t.left + t.width + t.left >= self.width + l_offset - t.average_character_width \ + #and t.left + t.width + t.left <= self.width + l_offset + t.average_character_width: + t.align = 'C' + # Right aligned if left > FACTOR% of right + elif lmargin > self.stats_indent \ + and lmargin > rmargin*RIGHT_FACTOR: + #and t.right >= self.width - t.average_character_width: + # What about right-aligned but indented on right? + # What about indented rather than right-aligned? + t.align = 'R' + if not self.contents: + # We can get self.stats_left \ + and lmargin != self.stats_indent \ + and lmargin >= rmargin - rmargin*CENTER_FACTOR \ + and lmargin <= rmargin + rmargin*CENTER_FACTOR: + i.align = 'C' + + def can_merge(self, first_text, second_text, stats): + # Can two lines be merged into one paragraph? + # Some PDFs have a wandering left margin which is consistent on a page + # but not within the whole document. Hence use self.stats_left + # Try to avoid close double quote at end of one and open double quote at start of next + # + # "float:left" occurs where there is a multi-line character, so indentation is messed up + if ((second_text.left < self.stats_left + second_text.average_character_width \ + and (second_text.left == first_text.last_left \ + or (second_text.left < first_text.last_left \ + and (first_text.indented > 0 or '"float:left"' in first_text.raw)))) \ + or (second_text.left == first_text.last_left \ + and second_text.left >= self.stats_indent) \ + or (second_text.left >= first_text.last_left \ + and second_text.bottom <= first_text.bottom)) \ + and 'href=' not in second_text.raw \ + and first_text.bottom + stats.line_space + (stats.line_space*LINE_FACTOR) \ + >= second_text.bottom \ + and first_text.final_width > self.width*self.opts.unwrap_factor \ + and not (first_text.text_as_string[-1] == '\u0022' and second_text.text_as_string[0] == '\u0022') \ + and not (first_text.text_as_string[-1] == '\u2019' and second_text.text_as_string[0] == '\u2018') \ + and not (first_text.text_as_string[-1] == '\u201d' and second_text.text_as_string[0] == '\u201c'): + # This has checked for single quotes (9...6), double quotes (99...66), and "..." + # at end of 1 line then start of next as a check for Don't merge + return True + return False + + def remove_head_foot(self, opts): + + # Remove headers or footers from a page + # if there is a regex supplied + if len(opts.pdf_header_regex) > 0 \ + and len(self.texts) > 0: + # Remove the first line if it matches + if re.match(opts.pdf_header_regex, self.texts[0].text_as_string) is not None : + self.texts.remove(self.texts[0]) + + + if len(opts.pdf_footer_regex) > 0 \ + and len(self.texts) > 0: + # Remove the last line if it matches + if re.match(opts.pdf_footer_regex, self.texts[-1].text_as_string) is not None : + self.texts.remove(self.texts[-1]) + + def coalesce_paras(self, stats): + + # Loop through texts elements and coalesce if same lmargin + # and no large gap between lines + # Have to restart loop if an entry is removed + # Doesn't work well with things like Contents list, hence check href + match_found = True + last_frag = None + scanning = False + while match_found: + match_found, match = False, None + # Same left margin probably means coalesce + for frag in self.texts: + # Remove lines of only spaces + if re.match(r'^\s+$', frag.raw) is not None: + match = frag + break + # Loop till we get to the last processed entry + if scanning \ + and last_frag is not None \ + and frag != last_frag: + continue + scanning = False + if last_frag is not None \ + and frag != last_frag \ + and self.can_merge(last_frag, frag, stats): + last_frag.coalesce(frag, self.number, self.left_margin) + last_frag.last_left = frag.left + last_frag.final_width = frag.final_width + match = frag + scanning = True # Restart loop and skip to this entry + break + else: + # Check for start of a paragraph being indented + # Ought to have some way of setting a standard indent + if frag.tag == 'p': + if frag.indented == 0 \ + and frag.align != 'C' \ + and frag.left > self.stats_left + frag.average_character_width: + #frag.indented = int((frag.left - self.stats_left) / frag.average_character_width) + # Is it approx self.stats_indent? + si = self.stats_indent * 0.15 # 15% + if frag.left >= self.stats_indent-si \ + and frag.left <= self.stats_indent+si: + frag.indented = 1 + else: + frag.indented = 2 + if last_frag is not None \ + and frag.bottom - last_frag.bottom \ + > stats.para_space*SECTION_FACTOR: + #and frag.top - last_frag.bottom > frag.height + stats.line_space + (stats.line_space*LINE_FACTOR): + frag.blank_line = 1 + last_frag = frag + if match is not None: + match_found = True + self.texts.remove(match) + + def create_page_format(self, stats, opts): + # Join fragments into lines + # then remove any headers/footers/unwanted areas + + self.update_font_sizes(stats) + + # Join fragments on a line + self.join_fragments(opts) + + # Remove headers or footers + self.remove_head_foot(opts) + + def find_margins(self, tops, indents, line_spaces, bottoms, rights): + + #from collections import Counter + + # Should check for left margin and indent for this page + # Find the most used top, left margins, and gaps between lines + # The most used font will be treated as size 1em + max_bot = 0 + max_right = 0 + last_bottom = 0 + first = True + for text in self.texts: + top = text.top + left = text.left + if round(left) != left : + text.left = left = round(left) + right = text.right + if round(right) != right : + text.right = right = round(right) + if first: + tops[top] = tops.get(top, 0) + 1 + first = False + else: + # Some docs have bottom > top although there is a space + space = abs(top - last_bottom) + # Beware of multiple text on same line. These look like small spacing + if text.height <= space: + line_spaces[space] = line_spaces.get(space, 0) + 1 + + last_bottom = text.bottom + if last_bottom > max_bot: + max_bot = last_bottom + + last_right = text.right + if last_right > max_right: + max_right = last_right + + indents[left] = indents.get(left, 0) + 1 + + if max_bot > 0: + bottoms[max_bot] = bottoms.get(max_bot, 0) + 1 + if max_right > 0: + rights[max_right] = rights.get(max_right, 0) + 1 + + return + ######################### + #### NOT IMPLEMENTED #### 'Sort page into regions and columns' self.regions = [] if not self.elements: @@ -510,7 +1024,8 @@ class Page: self.dump_regions('pre-coalesce') self.coalesce_regions() - self.dump_regions('post-coalesce') + if self.opts.verbose > 2: + self.dump_regions('post-coalesce') def dump_regions(self, fname): fname = 'regions-'+fname+'.txt' @@ -561,16 +1076,16 @@ class Page: absorb_into = prev_region if self.regions[next_region].line_count >= \ self.regions[prev_region].line_count: - avg_column_count = sum(len(r.columns) for r in - regions)/float(len(regions)) + avg_column_count = sum([len(r.columns) for r in + regions])/float(len(regions)) if self.regions[next_region].line_count > \ self.regions[prev_region].line_count \ or abs(avg_column_count - len(self.regions[prev_region].columns)) \ > abs(avg_column_count - len(self.regions[next_region].columns)): - absorb_into = next_region - absorb_at = 'top' + absorb_into = next_region + absorb_at = 'top' if absorb_into is not None: self.regions[absorb_into].absorb_regions(regions, absorb_at) absorbed.update(regions) @@ -579,7 +1094,7 @@ class Page: def sort_into_columns(self, elem, neighbors): neighbors.add(elem) - neighbors = sorted(neighbors, key=lambda x: x.left) + neighbors = sorted(neighbors, key=attrgetter('left')) if self.opts.verbose > 3: self.log.debug('Neighbors:', [x.to_html() for x in neighbors]) columns = [Column()] @@ -594,12 +1109,12 @@ class Page: if not added: columns.append(Column()) columns[-1].add(x) - columns.sort(key=lambda x: x.left) + columns.sort(key=attrgetter('left')) return columns def find_elements_in_row_of(self, x): interval = Interval(x.top, - x.top + self.YFUZZ*(self.average_text_height)) + x.top + YFUZZ*(self.average_text_height)) h_interval = Interval(x.left, x.right) for y in self.elements[x.idx:x.idx+15]: if y is not x: @@ -610,43 +1125,172 @@ class Page: x_interval.intersection(h_interval).width <= 0: yield y - def second_pass(self): + def update_font_sizes(self, stats): + # Font sizes start as pixels/points, but em is more useful + for text in self.texts: + text.font_size_em = self.font_map[text.font.id].size_em + if text.font_size_em != 0.00 and text.font_size_em != 1.00: + text.raw = '%s'%(str(text.font_size_em),text.raw) + + def second_pass(self, stats, opts): + + # If there are alternating pages, pick the left and indent for this one + if stats.left2 > 0 and self.left_margin == stats.left2: + self.stats_left = stats.left2 + self.stats_indent = stats.indent2 + else: + self.stats_left = stats.left + self.stats_indent = stats.indent + + # Join lines to form paragraphs + self.coalesce_paras(stats) + + self.check_centered(stats) + + #self.elements = list(self.texts) + #for img in page.xpath('descendant::img'): + # self.elements.append(Image(img, self.opts, self.log, idc)) + #self.elements.sort(cmp=lambda x,y:cmp(x.top, y.top)) + + return 'Locate paragraph boundaries in each column' for region in self.regions: region.collect_stats() region.linearize() + def to_html(self): + # If ans.append is used, newlines are inserted between each element + ans = [] + #ans.append('

'%self.number) + iind = 0 + itop = 0 + ilen = len(self.imgs) + for text in self.texts: + if iind < ilen: + itop = self.imgs[iind].top + else: + itop = 999999 + if itop <= text.top: + ans.append(' 0: + ans.append('

 

') + ans.append('<%s'%text.tag) + # Should be only for Headings, but there is no guarantee that the heading will be recognised + # So put in an ID once per page in case the Contents references it + # and text.tag[0] == 'h' + if self.id_used == 0: + self.id_used = 1 + ans[-1] += ' id="page_%d"'%self.number + if text.align == 'C': + ans[-1] += ' style="text-align:center"' + elif text.align == 'R': + ans[-1] += ' style="text-align:right"' + elif text.indented > 0: + ans[-1] += ' style="text-indent:' + ans[-1] += str(text.indented) + #ans[-1] += '1' + ans[-1] += 'em"' + ans[-1] += '>' + #ans.append(text.to_html()+' ') + #ans.append(''%text.tag) + ans[-1] += text.to_html() + ans[-1] += ''%text.tag + # Any remaining images + while iind < ilen: + ans.append(' 0 or self.opts.pdf_footer_skip > 0: + self.remove_header_footer() + + # Work out document dimensions from page format + for page in self.pages: + page.find_margins(self.tops, self.indents, self.line_spaces, \ + self.bottoms, self.rights) + + self.setup_stats() + + # Joins lines etc. into paragraphs + for page in self.pages: + page.second_pass(self.stats, self.opts) + + # Join paragraphs across page boundaries + self.merge_pages(self.stats.left) + + #self.linearize() self.render() def collect_font_statistics(self): @@ -658,8 +1302,386 @@ class PDFDocument: self.font_size_stats[sz] = 0 self.font_size_stats[sz] += chars + for text in p.texts: + font = int(text.font.id) + self.font_sizes[font] = self.font_sizes.get(font, 0) + 1 + self.font_size_stats = FontSizeStats(self.font_size_stats) + # Find most popular font so that will be treated as 1em + fcount = f_ind = 0 + for f in self.font_sizes: + if fcount < self.font_sizes[f]: + fcount = self.font_sizes[f] + f_ind = f + + if len(self.fonts) > 0: + self.stats.font_size = self.fonts[f_ind].size + else: + self.stats.font_size = 12.0 + + for f in self.fonts: + f.size_em = round(f.size / self.stats.font_size, 2) + + def setup_stats(self): + # This probably needs more work on line spacing/para spacing + # Maybe sort the line_spaces array. + # It is possible to have more than 1 line space value, e.g. 8.0, 9.0, 10.0 + # then more than 1 para space, e.g. 24.0, 25.0, 26.0 + # Thus the count of a para space could be > the most popular line space. + # So, probably need to find the max line space and max para space + # rather than simply the most popular. + # At the moment it only does this when spaces are close in popularity. + + # Find (next) most popular gap between lines + def find_line_space(skip): + scount, soffset = 0, 0 + for s in self.line_spaces: + if scount <= self.line_spaces[s] \ + and (skip <= 0 or self.line_spaces[s] < skip): + scount = self.line_spaces[s] + soffset = s + return scount, soffset + + # Find most popular top so that will be treated as top of page + tcount = 0 + for t in self.tops: + if tcount < self.tops[t]: + tcount = self.tops[t] + self.stats.top = t + + # Some PDFs have alternating pages with different lefts/indents. + # So, if the 2nd highest is 90% of the highest, assume it is a left. + # Same for the indents, assuming there were 2 lefts + + # Find most popular left so that will be treated as left of page + icount = 0 + for i in self.indents: + if icount < self.indents[i]: + icount = self.indents[i] + self.stats.left = i + + # Find second most popular left so that will be treated as indent + icount = 0 + for i in self.indents: + if i != self.stats.left and icount < self.indents[i]: + icount = self.indents[i] + self.stats.indent = i + + # For safety, check left and indent are in the right order + if self.stats.indent != 0 \ + and self.stats.left > self.stats.indent: + l = self.stats.left + self.stats.left = self.stats.indent + self.stats.indent = l + + # Now decide whether there are 2 similar, i.e. within 95% (arbitrary) + if self.stats.indent > 0 \ + and 100.0 * self.indents[self.stats.indent] / self.indents[self.stats.left] > 95.0: + self.stats.left2 = self.stats.indent + + # Find next most popular left so that will be treated as indent + icount = 0 + for i in self.indents: + if i != self.stats.left and i != self.stats.left2 and icount < self.indents[i]: + icount = self.indents[i] + self.stats.indent = i + + # And the last most popular left so that will be treated as indent2 + # Should check it is within 90%. What to do if not? + icount = 0 + for i in self.indents: + if i != self.stats.left and i != self.stats.left2 \ + and i != self.stats.indent and icount < self.indents[i]: + icount = self.indents[i] + self.stats.indent2 = i + + # And check indent and indent2 are in the right order + if self.stats.indent > self.stats.indent2: + l = self.stats.indent + self.stats.indent = self.stats.indent2 + self.stats.indent2 = l + + # Sort spaces into ascending order then loop through + # Lowest value(s) are line spacing, next are para + # Spaces not yet set up + self.stats.line_space = self.stats.para_space = -1.0 + # Find spacing values + # Find most popular space so that will be treated as line space + line_k = 0 + line_c = 0 + count = len(self.line_spaces) + while count > 0: + c, k = find_line_space(line_c) + if line_c <= 0: + line_c = c + if line_k <= 0: + line_k = k + elif abs(line_k - k) <= SAME_SPACE: + line_k = max(line_k, k) + line_c = min(line_c, c) + else: + break + count -= 1 + + # Get the next most popular gap + para_c = line_c-1 + para_k = 0 + count = len(self.line_spaces) + while count > 0: + c, k = find_line_space(para_c) + if para_k <= 0: + para_k = k + if abs(para_k - k) <= SAME_SPACE: + para_k = max(para_k, k) + para_c = min(para_c, c) + else: + break + count -= 1 + + # For safety, check in the right order + if line_k > para_k: + x = para_k + para_k = line_k + line_k = x + + self.stats.line_space = line_k + # Some docs have no great distinction for paragraphs + # Limit the size of the gap, or section breaks not found + if para_k > line_k * PARA_FACTOR: + self.stats.para_space = round(line_k * PARA_FACTOR) + else: + self.stats.para_space = para_k + + # Find most popular bottom so that will be treated as bottom of page + # Or the max bottom? Or the max used value within 10% of max value? + bcount = 0 + for b in self.bottoms: + if bcount < self.bottoms[b]: + #and b > self.stats.bottom*0.9: + bcount = self.bottoms[b] + #if b > self.stats.bottom: + self.stats.bottom = b + + # Find farthest right so that will be treated as page right + ## SHOULD DO RIGHT2 as well + rcount = 0 + for r in self.rights: + if rcount < r: + rcount = r + self.stats.right = r + + def find_header_footer(self): + # If requested, scan first few pages for possible headers/footers + + scan_count = 20 # Arbitrary + head_text = '' + head_match = 0 + head_match1 = 0 + head_page = 0 + head_skip = 0 + foot_text = '' + foot_match = 0 + foot_match1 = 0 + foot_page = 0 + foot_skip = 0 + pagenum_text = r'.*\d+\s+\w+\s+\d+.*' + + pages_to_scan = scan_count + # Note the a line may be in more than 1 part + # e.g. Page 1 of 6 ... DocName.pdf + # so merge first 2 lines if same top + # Ditto last 2 lines + # Maybe should do more than 2 parts + for page in self.pages: + if self.opts.pdf_header_skip < 0 and len(page.texts) > 0: + t = page.texts[0].text_as_string + if len(page.texts) > 1 and page.texts[0].top == page.texts[1].top: + t += page.texts[1].text_as_string + if len(head_text) == 0: + head_text = t + else: + if head_text == t: + head_match += 1 + if head_page == 0: + head_page = page.number + else: # Look for page count of format 'n xxx n' + if re.match(pagenum_text, t) is not None: + head_match1 += 1 + if head_page == 0: + head_page = page.number + + if self.opts.pdf_footer_skip < 0 and len(page.texts) > 0: + t = page.texts[-1].text_as_string + if len(page.texts) > 1 and page.texts[-1].top == page.texts[-2].top: + t += page.texts[-2].text_as_string + if len(foot_text) == 0: + foot_text = t + else: + if foot_text == t: + foot_match += 1 + if foot_page == 0: + foot_page = page.number + else: # Look for page count of format 'n xxx n' + if re.match(pagenum_text, t) is not None: + foot_match1 += 1 + if foot_page == 0: + foot_page = page.number + + pages_to_scan -= 1 + if pages_to_scan < 1: + break + + # How many pages have been scanned? + if pages_to_scan > scan_count-2: + # Doc is empty or 1 page. Can't decide on any skips + return + + if pages_to_scan > 0: + # Doc is shorter than scan_count + pages_to_scan = scan_count - pages_to_scan # Number scanned + else: + # All required pages scanned + pages_to_scan = scan_count + pages_to_scan /= 2 # Are at least half matching? + + if head_match > pages_to_scan or head_match1 > pages_to_scan: + t = self.pages[head_page].texts[0] + head_skip = t.top + t.height + 1 + + if foot_match > pages_to_scan or foot_match1 > pages_to_scan: + t = self.pages[foot_page].texts[-1] + foot_skip = t.top - 1 + + if head_skip > 0: + self.opts.pdf_header_skip = head_skip + if foot_skip > 0: + self.opts.pdf_footer_skip = foot_skip + + def remove_header_footer(self): + # Remove any header/footer lines from all pages + for page in self.pages: + # If a text is removed, we need to restart the loop or what was the next will be skipped + removed = 1 + while removed == 1: + removed = 0 + for t in page.texts: + if self.opts.pdf_header_skip > 0 and t.top < self.opts.pdf_header_skip \ + or self.opts.pdf_footer_skip > 0 and t.top > self.opts.pdf_footer_skip: + page.texts.remove(t) + removed = 1 + + def merge_pages(self, left_margin): + # Check for pages that can be merged + + # When merging pages, assume short last lines mean no merge + # BUT unfortunately there is no way to tell the difference + # between a continuation of a paragraph and a 'section break' + # if the previous page ends a sentence. + # Also long words can force a new line (at a new page) + # although the end of the previous is < this percent. + # Needs to find whether 1st word of 2nd page would fit on + # the last line of previous rather than the length of the last line. + # Pages can split early to avoid orphans. + ORPHAN_LINES = 2 + # First, find the minimum text top and the maximum text bottom + min_top = self.stats.top + max_bottom = self.stats.bottom + # Keep a note of the position of the final line on the merged page + save_bottom = 0 + # After merge, skip to this page + save_number = 0 + + # Now merge where bottom of one is within ORPHAN_LINES lines of max_bottom + # and top of next is within a line of min_top + # and margins correspond, and it's a normal paragraph + merge_done = True + while merge_done: + merge_done = False + merged_page = None + candidate = None + for page in self.pages: + if page.number < save_number: + next + if page.texts: + if candidate: + last_line = candidate.texts[-1] + merged_text = page.texts[0] + top = merged_text.top + # Should we check that the new line starts lower case? Doesn't cover all cases. + #and re.match('^[a-z]', merged_text.text_as_string) is not None + # How much space in characters was at the end of the last line? + # If the book is justified text, any space should mean end-of-para + # So, how to check for a justified book/page? + last_spare = (candidate.right_margin - last_line.right) / last_line.average_character_width + # How big is the first word on the next line? + merged_first = re.match(r'.+?(\s)+?', merged_text.text_as_string) + if merged_first is not None: + # First word length as float + merged_len = len(merged_first.group(0)) * 1.0 + else: + merged_len = merged_text.right + # Allow where the last line ends with or next line starts with lower case. + if re.match('[a-z, -]$', last_line.text_as_string) is not None \ + or re.match('^[a-z, -]', merged_text.text_as_string) is not None : + merged_len = merged_text.right + + # To use merged_len etc. + # Should not merge if speech where last ends 99 or 9 and next starts 66 or 6 + if top <= min_top + page.average_text_height \ + and merged_text.tag == 'p' \ + and 'href=' not in merged_text.raw \ + and merged_text.left < page.stats_left + merged_text.average_character_width \ + and not last_spare > merged_len \ + and not (re.match('.*(\u201d|”)$', last_line.text_as_string) is not None + and re.match('^(\u201c|“).*', merged_text.text_as_string) is not None): + merge_done = True + # We don't want to merge partial pages + # i.e. if this is the last line, preserve its top/bottom till after merge + if len(page.texts) == 1 : + save_bottom = merged_text.bottom + else: + save_bottom = 0.0 + + merged_text.top = candidate.texts[-1].top + page.average_text_height + merged_text.bottom = merged_text.top + page.average_text_height + candidate.texts.append(merged_text) + merged_page = page + break + candidate = None + last_line = page.texts[-1] + bottom = last_line.bottom + # Decide on whether merging is a good idea + # Non-indented paragraphs are a problem + if bottom >= max_bottom \ + - (ORPHAN_LINES*page.average_text_height) \ + - (ORPHAN_LINES*self.stats.line_space) \ + and (re.match('[a-z, ]$', last_line.text_as_string) is not None \ + or last_line.final_width > page.width*self.opts.unwrap_factor): + # or (last_line.right * 100.0 / page.right_margin > LAST_LINE_PERCENT)) : + candidate = page + else: + candidate = None + if merge_done: + merged_page.texts.remove(merged_text) + # We now need to skip to the next page number + # The current page can no longer have anything to merge + save_number = merged_page.number + 1 + # Re-calling coalesce_paras doesn't seem to work + candidate.texts[-2].coalesce(candidate.texts[-1], candidate.number, left_margin) + candidate.texts.remove(candidate.texts[-1]) + # Put back top/bottom after coalesce if final line + if save_bottom != 0.0 : + # Ignore top as that can confuse things where the 1st para of a page + # was merged with a previous. Keep the original top + #candidate.texts[-1].top = save_top + candidate.texts[-1].bottom = save_bottom + #candidate.coalesce_paras() + # Have we removed everything from this page (well, all texts) + if len(merged_page.texts) == 0: + self.pages.remove(merged_page) + + def linearize(self): self.elements = [] last_region = last_block = None @@ -685,14 +1707,28 @@ class PDFDocument: last_block = block last_region = region + def render(self): + #### Where does the title come from if not run from command line? + title = 'Converted Ebook' + if len(sys.argv) > 1: + title = sys.argv[1] + # Need to insert font info and styles html = ['', '', '', - 'PDF Reflow conversion', '', '', - '
'] - for elem in self.elements: - html.extend(elem.to_html()) + ''+title+'', + '', + '', ''] + for page in self.pages: + html.extend(page.to_html()) html += ['', ''] raw = ('\n'.join(html)).replace('', '') + raw = raw.replace('', '') + raw = raw.replace('', '') + raw = raw.replace('', '') + raw = raw.replace(' ', ' ') + raw = raw.replace(' ', ' ') + raw = raw.replace(' ', ' ') + raw = raw.replace(' ', ' ') with open('index.html', 'wb') as f: f.write(raw.encode('utf-8')) diff --git a/src/calibre/gui2/convert/pdf_input.py b/src/calibre/gui2/convert/pdf_input.py index 9c6f952de6..a431660978 100644 --- a/src/calibre/gui2/convert/pdf_input.py +++ b/src/calibre/gui2/convert/pdf_input.py @@ -18,8 +18,17 @@ class PluginWidget(Widget, Ui_Form): Widget.__init__(self, parent, OPTIONS['input']['pdf']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) + self.opt_new_pdf_engine.toggled.connect(self.update_engine_opts) + self.update_engine_opts() def set_value_handler(self, g, val): if val is None and isinstance(g, QDoubleSpinBox): g.setValue(0.0) return True + + def update_engine_opts(self): + enabled = self.opt_new_pdf_engine.isChecked() + self.opt_pdf_footer_skip.setEnabled(enabled) + self.opt_pdf_header_skip.setEnabled(enabled) + self.opt_pdf_header_regex.setEnabled(enabled) + self.opt_pdf_footer_regex.setEnabled(enabled) diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui index 3aa396f1ce..5d9acdd872 100644 --- a/src/calibre/gui2/convert/pdf_input.ui +++ b/src/calibre/gui2/convert/pdf_input.ui @@ -6,7 +6,7 @@ 0 0 - 400 + 413 300 @@ -24,19 +24,6 @@ - - - - Qt::Vertical - - - - 20 - 213 - - - - @@ -57,6 +44,118 @@ + + + + Remove headers at &top of page by: + + + opt_pdf_header_skip + + + + + + + Automatically + + + px + + + -1 + + + 99999 + + + -1 + + + + + + + Remove footers at &bottom of page by: + + + opt_pdf_footer_skip + + + + + + + Automatically + + + px + + + -1 + + + 99999 + + + -1 + + + + + + + Regular expression to remove &header at top of page: + + + opt_pdf_header_regex + + + + + + + true + + + + + + + Regular expression to remove &footer at bottom of page: + + + opt_pdf_footer_regex + + + + + + + true + + + + + + + Qt::Vertical + + + + 20 + 20 + + + + + + + + &New, experimental, PDF conversion Engine + + + diff --git a/src/pyj/book_list/conversion_widgets.pyj b/src/pyj/book_list/conversion_widgets.pyj index 63f915a614..e56e60cdf1 100644 --- a/src/pyj/book_list/conversion_widgets.pyj +++ b/src/pyj/book_list/conversion_widgets.pyj @@ -487,6 +487,11 @@ def pdf_input(container): container.appendChild(g) g.appendChild(float_spin('unwrap_factor', _('Line &un-wrapping factor:'), max=1, step=0.01)) g.appendChild(checkbox('no_images', _('No &images'))) + g.appendChild(checkbox('new_pdf_engine', _('New, experimental, PDF conversion &engine'))) + g.appendChild(int_spin('pdf_header_skip', _('Remove headers at &top of page by:'), min=-1, max=999999, step=1)) + g.appendChild(int_spin('pdf_footer_skip', _('Remove footers at &bottom of page by:'), min=-1, max=999999, step=1)) + g.appendChild(lineedit('pdf_header_regex', _('Regular expression to remove &header at top of page:'))) + g.appendChild(lineedit('pdf_footer_regex', _('Regular expression to remove &footer at bottom of page:'))) # }}} # RTF Input {{{