PDF Input: Automatic header/footer detection and removal

A new, dedicated PDF Input engine for calibre that implements
automatic detection of headers and footers based on document analysis.
The new engine is the default. To go back to using the old engine, select
it in the PDF Input section of the Conversion dialog. See #2076346 (PDF conversion new engine does not work)
This commit is contained in:
Kovid Goyal 2024-10-17 11:11:09 +05:30
parent 16cb7d3083
commit 8359e89cac
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 107 additions and 91 deletions

View File

@ -221,7 +221,7 @@ OPTIONS = {
'fb2': ('no_inline_fb2_toc',),
'pdf': ('no_images', 'unwrap_factor', 'new_pdf_engine', 'pdf_header_skip', 'pdf_footer_skip', 'pdf_header_regex', 'pdf_footer_regex'),
'pdf': ('no_images', 'unwrap_factor', 'pdf_engine', 'pdf_header_skip', 'pdf_footer_skip', 'pdf_header_regex', 'pdf_footer_regex'),
'rtf': ('ignore_wmf',),

View File

@ -7,6 +7,8 @@ import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from polyglot.builtins import as_bytes
ENGINES = 'calibre', 'pdftohtml'
class PDFInput(InputFormatPlugin):
@ -23,8 +25,10 @@ class PDFInput(InputFormatPlugin):
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
'default is 0.45, just below the median line length.')),
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
help=_('Use the new, experimental, PDF conversion engine.')),
OptionRecommendation(name='pdf_engine', recommended_value='calibre', choices=('calibre', 'pdftohtml'),
help=_('The PDF engine to use, the "calibre" engine is recommended as it has automatic header and footer removal.'
' Choices: {}'
).format(', '.join(ENGINES))),
OptionRecommendation(name='pdf_header_skip', recommended_value=-1,
help=_('Skip everything to the specified number of pixels at the top of a page.'
' Negative numbers mean auto-detect and remove headers, zero means do not remove headers and positive numbers'
@ -35,14 +39,14 @@ class PDFInput(InputFormatPlugin):
help=_('Skip everything to the specified number of pixels at the bottom of a page.'
' Negative numbers mean auto-detect and remove footers, zero means do not remove footers and positive numbers'
' mean remove footers that appear below that many pixels from the bottom of the page. Works only'
' with the new PDF engine.'
' with the calibre PDF engine.'
)),
OptionRecommendation(name='pdf_header_regex', recommended_value='',
help=_('Regular expression to remove lines at the top of a page. '
'This only looks at the first line of a page and works only with the new PDF engine.')),
'This only looks at the first line of a page and works only with the calibre PDF engine.')),
OptionRecommendation(name='pdf_footer_regex', recommended_value='',
help=_('Regular expression to remove lines at the bottom of a page. '
'This only looks at the last line of a page and works only with the new PDF engine.')),
'This only looks at the last line of a page and works only with the calibre PDF engine.')),
}
def convert(self, stream, options, file_ext, log,
@ -53,7 +57,7 @@ class PDFInput(InputFormatPlugin):
log.debug('Converting file to html...')
# The main html file will be named index.html
self.opts, self.log = options, log
if options.new_pdf_engine:
if options.pdf_engine == 'calibre':
from calibre.ebooks.pdf.reflow import PDFDocument
from calibre.utils.cleantext import clean_ascii_chars
pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)

View File

@ -17,17 +17,23 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, OPTIONS['input']['pdf'])
self.db, self.book_id = db, book_id
from calibre.ebooks.conversion.plugins.pdf_input import ENGINES
self.opt_pdf_engine.addItems(ENGINES)
self.initialize_options(get_option, get_help, db, book_id)
self.opt_new_pdf_engine.toggled.connect(self.update_engine_opts)
self.opt_pdf_engine.currentIndexChanged.connect(self.update_engine_opts)
self.update_engine_opts()
def set_value_handler(self, g, val):
if val is None and isinstance(g, QDoubleSpinBox):
g.setValue(0.0)
return True
if g is self.opt_pdf_engine:
idx = g.findText(val)
if idx > -1:
g.setCurrentIndex(idx)
def update_engine_opts(self):
enabled = self.opt_new_pdf_engine.isChecked()
enabled = self.opt_pdf_engine.currentText() == 'calibre'
self.opt_pdf_footer_skip.setEnabled(enabled)
self.opt_pdf_header_skip.setEnabled(enabled)
self.opt_pdf_header_regex.setEnabled(enabled)

View File

@ -14,36 +14,6 @@
<string>Form</string>
</property>
<layout class="QGridLayout" name="gridLayout">
<item row="0" column="0">
<widget class="QLabel" name="label_2">
<property name="text">
<string>Line &amp;un-wrapping factor:</string>
</property>
<property name="buddy">
<cstring>opt_unwrap_factor</cstring>
</property>
</widget>
</item>
<item row="0" column="1">
<widget class="QDoubleSpinBox" name="opt_unwrap_factor">
<property name="maximum">
<double>1.000000000000000</double>
</property>
<property name="singleStep">
<double>0.010000000000000</double>
</property>
<property name="value">
<double>0.450000000000000</double>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QCheckBox" name="opt_no_images">
<property name="text">
<string>No &amp;images</string>
</property>
</widget>
</item>
<item row="3" column="0">
<widget class="QLabel" name="label_t">
<property name="text">
@ -54,35 +24,6 @@
</property>
</widget>
</item>
<item row="3" column="1">
<widget class="QSpinBox" name="opt_pdf_header_skip">
<property name="specialValueText">
<string>Automatically</string>
</property>
<property name="suffix">
<string> px</string>
</property>
<property name="minimum">
<number>-1</number>
</property>
<property name="maximum">
<number>99999</number>
</property>
<property name="value">
<number>-1</number>
</property>
</widget>
</item>
<item row="4" column="0">
<widget class="QLabel" name="label_b">
<property name="text">
<string>Remove footers at &amp;bottom of page by:</string>
</property>
<property name="buddy">
<cstring>opt_pdf_footer_skip</cstring>
</property>
</widget>
</item>
<item row="4" column="1">
<widget class="QSpinBox" name="opt_pdf_footer_skip">
<property name="specialValueText">
@ -102,15 +43,8 @@
</property>
</widget>
</item>
<item row="5" column="0">
<widget class="QLabel" name="label_rt">
<property name="text">
<string>Regular expression to remove &amp;header at top of page:</string>
</property>
<property name="buddy">
<cstring>opt_pdf_header_regex</cstring>
</property>
</widget>
<item row="2" column="1">
<widget class="QComboBox" name="opt_pdf_engine"/>
</item>
<item row="5" column="1">
<widget class="QLineEdit" name="opt_pdf_header_regex">
@ -119,16 +53,6 @@
</property>
</widget>
</item>
<item row="6" column="0">
<widget class="QLabel" name="label_rb">
<property name="text">
<string>Regular expression to remove &amp;footer at bottom of page:</string>
</property>
<property name="buddy">
<cstring>opt_pdf_footer_regex</cstring>
</property>
</widget>
</item>
<item row="6" column="1">
<widget class="QLineEdit" name="opt_pdf_footer_regex">
<property name="clearButtonEnabled">
@ -149,10 +73,92 @@
</property>
</spacer>
</item>
<item row="2" column="0" colspan="2">
<widget class="QCheckBox" name="opt_new_pdf_engine">
<item row="5" column="0">
<widget class="QLabel" name="label_rt">
<property name="text">
<string>New, experimental, PDF conversion &amp;engine</string>
<string>Regular expression to remove &amp;header at top of page:</string>
</property>
<property name="buddy">
<cstring>opt_pdf_header_regex</cstring>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QCheckBox" name="opt_no_images">
<property name="text">
<string>No &amp;images</string>
</property>
</widget>
</item>
<item row="6" column="0">
<widget class="QLabel" name="label_rb">
<property name="text">
<string>Regular expression to remove &amp;footer at bottom of page:</string>
</property>
<property name="buddy">
<cstring>opt_pdf_footer_regex</cstring>
</property>
</widget>
</item>
<item row="0" column="1">
<widget class="QDoubleSpinBox" name="opt_unwrap_factor">
<property name="maximum">
<double>1.000000000000000</double>
</property>
<property name="singleStep">
<double>0.010000000000000</double>
</property>
<property name="value">
<double>0.450000000000000</double>
</property>
</widget>
</item>
<item row="4" column="0">
<widget class="QLabel" name="label_b">
<property name="text">
<string>Remove footers at &amp;bottom of page by:</string>
</property>
<property name="buddy">
<cstring>opt_pdf_footer_skip</cstring>
</property>
</widget>
</item>
<item row="2" column="0">
<widget class="QLabel" name="label">
<property name="text">
<string>PDF &amp;engine:</string>
</property>
<property name="buddy">
<cstring>opt_pdf_engine</cstring>
</property>
</widget>
</item>
<item row="0" column="0">
<widget class="QLabel" name="label_2">
<property name="text">
<string>Line &amp;un-wrapping factor:</string>
</property>
<property name="buddy">
<cstring>opt_unwrap_factor</cstring>
</property>
</widget>
</item>
<item row="3" column="1">
<widget class="QSpinBox" name="opt_pdf_header_skip">
<property name="specialValueText">
<string>Automatically</string>
</property>
<property name="suffix">
<string> px</string>
</property>
<property name="minimum">
<number>-1</number>
</property>
<property name="maximum">
<number>99999</number>
</property>
<property name="value">
<number>-1</number>
</property>
</widget>
</item>

View File

@ -487,7 +487,7 @@ def pdf_input(container):
container.appendChild(g)
g.appendChild(float_spin('unwrap_factor', _('Line &un-wrapping factor:'), max=1, step=0.01))
g.appendChild(checkbox('no_images', _('No &images')))
g.appendChild(checkbox('new_pdf_engine', _('New, experimental, PDF conversion &engine')))
g.appendChild(choices('pdf_engine', _('PDF &engine:'), {'calibre': 'calibre', 'pdftohtml': 'pdftohtml'}))
g.appendChild(int_spin('pdf_header_skip', _('Remove headers at &top of page by:'), min=-1, max=999999, step=1))
g.appendChild(int_spin('pdf_footer_skip', _('Remove footers at &bottom of page by:'), min=-1, max=999999, step=1))
g.appendChild(lineedit('pdf_header_regex', _('Regular expression to remove &header at top of page:')))