New engine to convert PDF files with support for automatic header/footer removal. Fixes #2076346 [PDF conversion new engine does not work](https://bugs.launchpad.net/calibre/+bug/2076346)

This commit is contained in:
Kovid Goyal 2024-09-12 12:52:34 +05:30
parent f4a4f19786
commit a8fb98459b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
6 changed files with 1288 additions and 126 deletions

View File

@ -221,7 +221,7 @@ OPTIONS = {
'fb2': ('no_inline_fb2_toc',),
'pdf': ('no_images', 'unwrap_factor'),
'pdf': ('no_images', 'unwrap_factor', 'new_pdf_engine', 'pdf_header_skip', 'pdf_footer_skip', 'pdf_header_regex', 'pdf_footer_regex'),
'rtf': ('ignore_wmf',),

View File

@ -11,7 +11,7 @@ from polyglot.builtins import as_bytes
class PDFInput(InputFormatPlugin):
name = 'PDF Input'
author = 'Kovid Goyal and John Schember'
author = 'Kovid Goyal and John Schember and Alan Pettigrew'
description = _('Convert PDF files to HTML')
file_types = {'pdf'}
commit_name = 'pdf_input'
@ -24,20 +24,27 @@ class PDFInput(InputFormatPlugin):
'be unwrapped. Valid values are a decimal between 0 and 1. The '
'default is 0.45, just below the median line length.')),
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
help=_('Use the new PDF conversion engine. Currently not operational.'))
help=_('Use the new, experimental, PDF conversion engine.')),
OptionRecommendation(name='pdf_header_skip', recommended_value=-1,
help=_('Skip everything to the specified number pixels at the top of a page.'
' Negative numbers mean auto-detect and remove headers, zero means do not remove headers and positive numbers'
' mean remove headers that appear above that many pixels from the top of the page. Works only'
' with the new PDF engine.'
)),
OptionRecommendation(name='pdf_footer_skip', recommended_value=-1,
help=_('Skip everything to the specified number of pixels at the bottom of a page.'
' Negative numbers mean auto-detect and remove footers, zero means do not remove footers and positive numbers'
' mean remove footers that appear below that many pixels from the bottom of the page. Works only'
' with the new PDF engine.'
)),
OptionRecommendation(name='pdf_header_regex', recommended_value='',
help=_('Regular expression to remove lines at the top of a page. '
'This only looks at the first line of a page and works only with the new PDF engine.')),
OptionRecommendation(name='pdf_footer_regex', recommended_value='',
help=_('Regular expression to remove lines at the bottom of a page. '
'This only looks at the last line of a page and works only with the new PDF engine.')),
}
def convert_new(self, stream, accelerators):
    """Convert the PDF in ``stream`` through the XML/reflow code path.

    ``pdftohtml`` is run in XML mode against the current working
    directory, ``index.xml`` is read back from that directory, cleaned
    with ``clean_ascii_chars`` and handed to ``PDFDocument`` together
    with the stored options and log.

    :param stream: Input file object; only its ``name`` is used here.
    :param accelerators: Not used by this method.
    :return: Path of ``metadata.opf`` inside the current working directory.
    """
    from calibre.ebooks.pdf.pdftohtml import pdftohtml
    from calibre.ebooks.pdf.reflow import PDFDocument
    from calibre.utils.cleantext import clean_ascii_chars

    pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)

    # NOTE(review): index.xml is presumably written by the pdftohtml call
    # above (as_xml=True) -- confirm against pdftohtml.
    with open('index.xml', 'rb') as xml_file:
        raw_xml = xml_file.read()
    cleaned_xml = clean_ascii_chars(raw_xml)

    # The PDFDocument instance is not kept; it is constructed only for
    # whatever it does during construction.
    PDFDocument(cleaned_xml, self.opts, self.log)

    opf_path = os.path.join(os.getcwd(), 'metadata.opf')
    return opf_path
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.metadata.opf2 import OPFCreator
@ -47,8 +54,14 @@ class PDFInput(InputFormatPlugin):
# The main html file will be named index.html
self.opts, self.log = options, log
if options.new_pdf_engine:
return self.convert_new(stream, accelerators)
pdftohtml(os.getcwd(), stream.name, options.no_images)
from calibre.ebooks.pdf.reflow import PDFDocument
from calibre.utils.cleantext import clean_ascii_chars
pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
with open(u'index.xml', 'rb') as f:
xml = clean_ascii_chars(f.read())
PDFDocument(xml, self.opts, self.log)
else:
pdftohtml(os.getcwd(), stream.name, options.no_images)
from calibre.ebooks.metadata.meta import get_metadata
log.debug('Retrieving document metadata...')

File diff suppressed because it is too large Load Diff

View File

@ -18,8 +18,17 @@ class PluginWidget(Widget, Ui_Form):
Widget.__init__(self, parent, OPTIONS['input']['pdf'])
self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id)
self.opt_new_pdf_engine.toggled.connect(self.update_engine_opts)
self.update_engine_opts()
def set_value_handler(self, g, val):
    """Reset a ``QDoubleSpinBox`` to 0.0 when no value is supplied.

    :param g: The widget whose value is being set.
    :param val: The value to apply; only ``None`` is handled here.
    :return: ``True`` when ``val`` is ``None`` and ``g`` is a
        ``QDoubleSpinBox`` (after setting it to ``0.0``), otherwise ``None``.
    """
    # The type check is only reached when val is None, as in the
    # combined condition this guard replaces.
    if val is not None or not isinstance(g, QDoubleSpinBox):
        return None
    g.setValue(0.0)
    return True
def update_engine_opts(self):
    """Enable or disable the header/footer widgets to match the engine checkbox.

    Reads the checked state of ``opt_new_pdf_engine`` and applies it, via
    ``setEnabled``, to the four header/footer skip and regex widgets.
    """
    use_new_engine = self.opt_new_pdf_engine.isChecked()
    dependent_widgets = (
        'opt_pdf_footer_skip',
        'opt_pdf_header_skip',
        'opt_pdf_header_regex',
        'opt_pdf_footer_regex',
    )
    for widget_name in dependent_widgets:
        getattr(self, widget_name).setEnabled(use_new_engine)

View File

@ -6,7 +6,7 @@
<rect>
<x>0</x>
<y>0</y>
<width>400</width>
<width>413</width>
<height>300</height>
</rect>
</property>
@ -24,19 +24,6 @@
</property>
</widget>
</item>
<item row="2" column="0">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>213</height>
</size>
</property>
</spacer>
</item>
<item row="0" column="1">
<widget class="QDoubleSpinBox" name="opt_unwrap_factor">
<property name="maximum">
@ -57,6 +44,118 @@
</property>
</widget>
</item>
<item row="3" column="0">
<widget class="QLabel" name="label_t">
<property name="text">
<string>Remove headers at &amp;top of page by:</string>
</property>
<property name="buddy">
<cstring>opt_pdf_header_skip</cstring>
</property>
</widget>
</item>
<item row="3" column="1">
<widget class="QSpinBox" name="opt_pdf_header_skip">
<property name="specialValueText">
<string>Automatically</string>
</property>
<property name="suffix">
<string> px</string>
</property>
<property name="minimum">
<number>-1</number>
</property>
<property name="maximum">
<number>99999</number>
</property>
<property name="value">
<number>-1</number>
</property>
</widget>
</item>
<item row="4" column="0">
<widget class="QLabel" name="label_b">
<property name="text">
<string>Remove footers at &amp;bottom of page by:</string>
</property>
<property name="buddy">
<cstring>opt_pdf_footer_skip</cstring>
</property>
</widget>
</item>
<item row="4" column="1">
<widget class="QSpinBox" name="opt_pdf_footer_skip">
<property name="specialValueText">
<string>Automatically</string>
</property>
<property name="suffix">
<string> px</string>
</property>
<property name="minimum">
<number>-1</number>
</property>
<property name="maximum">
<number>99999</number>
</property>
<property name="value">
<number>-1</number>
</property>
</widget>
</item>
<item row="5" column="0">
<widget class="QLabel" name="label_rt">
<property name="text">
<string>Regular expression to remove &amp;header at top of page:</string>
</property>
<property name="buddy">
<cstring>opt_pdf_header_regex</cstring>
</property>
</widget>
</item>
<item row="5" column="1">
<widget class="QLineEdit" name="opt_pdf_header_regex">
<property name="clearButtonEnabled">
<bool>true</bool>
</property>
</widget>
</item>
<item row="6" column="0">
<widget class="QLabel" name="label_rb">
<property name="text">
<string>Regular expression to remove &amp;footer at bottom of page:</string>
</property>
<property name="buddy">
<cstring>opt_pdf_footer_regex</cstring>
</property>
</widget>
</item>
<item row="6" column="1">
<widget class="QLineEdit" name="opt_pdf_footer_regex">
<property name="clearButtonEnabled">
<bool>true</bool>
</property>
</widget>
</item>
<item row="7" column="0">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>20</height>
</size>
</property>
</spacer>
</item>
<item row="2" column="0" colspan="2">
<widget class="QCheckBox" name="opt_new_pdf_engine">
<property name="text">
<string>&amp;New, experimental, PDF conversion Engine</string>
</property>
</widget>
</item>
</layout>
</widget>
<resources/>

View File

@ -487,6 +487,11 @@ def pdf_input(container):
container.appendChild(g)
g.appendChild(float_spin('unwrap_factor', _('Line &un-wrapping factor:'), max=1, step=0.01))
g.appendChild(checkbox('no_images', _('No &images')))
g.appendChild(checkbox('new_pdf_engine', _('New, experimental, PDF conversion &engine')))
g.appendChild(int_spin('pdf_header_skip', _('Remove headers at &top of page by:'), min=-1, max=999999, step=1))
g.appendChild(int_spin('pdf_footer_skip', _('Remove footers at &bottom of page by:'), min=-1, max=999999, step=1))
g.appendChild(lineedit('pdf_header_regex', _('Regular expression to remove &header at top of page:')))
g.appendChild(lineedit('pdf_footer_regex', _('Regular expression to remove &footer at bottom of page:')))
# }}}
# RTF Input {{{