mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
PDF Input: Automatic header/footer detection and removal
A new, dedicated PDF Input engine for calibre that implements automatic detection of headers and footers based on document analysis. The new engine is the default; to go back to using the old engine, select it in the PDF Input section of the Conversion dialog. See #2076346 (PDF conversion new engine does not work)
This commit is contained in:
parent
16cb7d3083
commit
8359e89cac
@ -221,7 +221,7 @@ OPTIONS = {
|
|||||||
|
|
||||||
'fb2': ('no_inline_fb2_toc',),
|
'fb2': ('no_inline_fb2_toc',),
|
||||||
|
|
||||||
'pdf': ('no_images', 'unwrap_factor', 'new_pdf_engine', 'pdf_header_skip', 'pdf_footer_skip', 'pdf_header_regex', 'pdf_footer_regex'),
|
'pdf': ('no_images', 'unwrap_factor', 'pdf_engine', 'pdf_header_skip', 'pdf_footer_skip', 'pdf_header_regex', 'pdf_footer_regex'),
|
||||||
|
|
||||||
'rtf': ('ignore_wmf',),
|
'rtf': ('ignore_wmf',),
|
||||||
|
|
||||||
|
@ -7,6 +7,8 @@ import os
|
|||||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||||
from polyglot.builtins import as_bytes
|
from polyglot.builtins import as_bytes
|
||||||
|
|
||||||
|
ENGINES = 'calibre', 'pdftohtml'
|
||||||
|
|
||||||
|
|
||||||
class PDFInput(InputFormatPlugin):
|
class PDFInput(InputFormatPlugin):
|
||||||
|
|
||||||
@ -23,8 +25,10 @@ class PDFInput(InputFormatPlugin):
|
|||||||
help=_('Scale used to determine the length at which a line should '
|
help=_('Scale used to determine the length at which a line should '
|
||||||
'be unwrapped. Valid values are a decimal between 0 and 1. The '
|
'be unwrapped. Valid values are a decimal between 0 and 1. The '
|
||||||
'default is 0.45, just below the median line length.')),
|
'default is 0.45, just below the median line length.')),
|
||||||
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
|
OptionRecommendation(name='pdf_engine', recommended_value='calibre', choices=('calibre', 'pdftohtml'),
|
||||||
help=_('Use the new, experimental, PDF conversion engine.')),
|
help=_('The PDF engine to use, the "calibre" engine is recommended as it has automatic header and footer removal.'
|
||||||
|
' Choices: {}'
|
||||||
|
).format(', '.join(ENGINES))),
|
||||||
OptionRecommendation(name='pdf_header_skip', recommended_value=-1,
|
OptionRecommendation(name='pdf_header_skip', recommended_value=-1,
|
||||||
help=_('Skip everything to the specified number of pixels at the top of a page.'
|
help=_('Skip everything to the specified number of pixels at the top of a page.'
|
||||||
' Negative numbers mean auto-detect and remove headers, zero means do not remove headers and positive numbers'
|
' Negative numbers mean auto-detect and remove headers, zero means do not remove headers and positive numbers'
|
||||||
@ -35,14 +39,14 @@ class PDFInput(InputFormatPlugin):
|
|||||||
help=_('Skip everything to the specified number of pixels at the bottom of a page.'
|
help=_('Skip everything to the specified number of pixels at the bottom of a page.'
|
||||||
' Negative numbers mean auto-detect and remove footers, zero means do not remove footers and positive numbers'
|
' Negative numbers mean auto-detect and remove footers, zero means do not remove footers and positive numbers'
|
||||||
' mean remove footers that appear below that many pixels from the bottom of the page. Works only'
|
' mean remove footers that appear below that many pixels from the bottom of the page. Works only'
|
||||||
' with the new PDF engine.'
|
' with the calibre PDF engine.'
|
||||||
)),
|
)),
|
||||||
OptionRecommendation(name='pdf_header_regex', recommended_value='',
|
OptionRecommendation(name='pdf_header_regex', recommended_value='',
|
||||||
help=_('Regular expression to remove lines at the top of a page. '
|
help=_('Regular expression to remove lines at the top of a page. '
|
||||||
'This only looks at the first line of a page and works only with the new PDF engine.')),
|
'This only looks at the first line of a page and works only with the calibre PDF engine.')),
|
||||||
OptionRecommendation(name='pdf_footer_regex', recommended_value='',
|
OptionRecommendation(name='pdf_footer_regex', recommended_value='',
|
||||||
help=_('Regular expression to remove lines at the bottom of a page. '
|
help=_('Regular expression to remove lines at the bottom of a page. '
|
||||||
'This only looks at the last line of a page and works only with the new PDF engine.')),
|
'This only looks at the last line of a page and works only with the calibre PDF engine.')),
|
||||||
}
|
}
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
@ -53,7 +57,7 @@ class PDFInput(InputFormatPlugin):
|
|||||||
log.debug('Converting file to html...')
|
log.debug('Converting file to html...')
|
||||||
# The main html file will be named index.html
|
# The main html file will be named index.html
|
||||||
self.opts, self.log = options, log
|
self.opts, self.log = options, log
|
||||||
if options.new_pdf_engine:
|
if options.pdf_engine == 'calibre':
|
||||||
from calibre.ebooks.pdf.reflow import PDFDocument
|
from calibre.ebooks.pdf.reflow import PDFDocument
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
|
pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
|
||||||
|
@ -17,17 +17,23 @@ class PluginWidget(Widget, Ui_Form):
|
|||||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||||
Widget.__init__(self, parent, OPTIONS['input']['pdf'])
|
Widget.__init__(self, parent, OPTIONS['input']['pdf'])
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
|
from calibre.ebooks.conversion.plugins.pdf_input import ENGINES
|
||||||
|
self.opt_pdf_engine.addItems(ENGINES)
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
self.initialize_options(get_option, get_help, db, book_id)
|
||||||
self.opt_new_pdf_engine.toggled.connect(self.update_engine_opts)
|
self.opt_pdf_engine.currentIndexChanged.connect(self.update_engine_opts)
|
||||||
self.update_engine_opts()
|
self.update_engine_opts()
|
||||||
|
|
||||||
def set_value_handler(self, g, val):
|
def set_value_handler(self, g, val):
|
||||||
if val is None and isinstance(g, QDoubleSpinBox):
|
if val is None and isinstance(g, QDoubleSpinBox):
|
||||||
g.setValue(0.0)
|
g.setValue(0.0)
|
||||||
return True
|
return True
|
||||||
|
if g is self.opt_pdf_engine:
|
||||||
|
idx = g.findText(val)
|
||||||
|
if idx > -1:
|
||||||
|
g.setCurrentIndex(idx)
|
||||||
|
|
||||||
def update_engine_opts(self):
|
def update_engine_opts(self):
|
||||||
enabled = self.opt_new_pdf_engine.isChecked()
|
enabled = self.opt_pdf_engine.currentText() == 'calibre'
|
||||||
self.opt_pdf_footer_skip.setEnabled(enabled)
|
self.opt_pdf_footer_skip.setEnabled(enabled)
|
||||||
self.opt_pdf_header_skip.setEnabled(enabled)
|
self.opt_pdf_header_skip.setEnabled(enabled)
|
||||||
self.opt_pdf_header_regex.setEnabled(enabled)
|
self.opt_pdf_header_regex.setEnabled(enabled)
|
||||||
|
@ -14,36 +14,6 @@
|
|||||||
<string>Form</string>
|
<string>Form</string>
|
||||||
</property>
|
</property>
|
||||||
<layout class="QGridLayout" name="gridLayout">
|
<layout class="QGridLayout" name="gridLayout">
|
||||||
<item row="0" column="0">
|
|
||||||
<widget class="QLabel" name="label_2">
|
|
||||||
<property name="text">
|
|
||||||
<string>Line &un-wrapping factor:</string>
|
|
||||||
</property>
|
|
||||||
<property name="buddy">
|
|
||||||
<cstring>opt_unwrap_factor</cstring>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="0" column="1">
|
|
||||||
<widget class="QDoubleSpinBox" name="opt_unwrap_factor">
|
|
||||||
<property name="maximum">
|
|
||||||
<double>1.000000000000000</double>
|
|
||||||
</property>
|
|
||||||
<property name="singleStep">
|
|
||||||
<double>0.010000000000000</double>
|
|
||||||
</property>
|
|
||||||
<property name="value">
|
|
||||||
<double>0.450000000000000</double>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="1" column="0">
|
|
||||||
<widget class="QCheckBox" name="opt_no_images">
|
|
||||||
<property name="text">
|
|
||||||
<string>No &images</string>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="3" column="0">
|
<item row="3" column="0">
|
||||||
<widget class="QLabel" name="label_t">
|
<widget class="QLabel" name="label_t">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
@ -54,35 +24,6 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="3" column="1">
|
|
||||||
<widget class="QSpinBox" name="opt_pdf_header_skip">
|
|
||||||
<property name="specialValueText">
|
|
||||||
<string>Automatically</string>
|
|
||||||
</property>
|
|
||||||
<property name="suffix">
|
|
||||||
<string> px</string>
|
|
||||||
</property>
|
|
||||||
<property name="minimum">
|
|
||||||
<number>-1</number>
|
|
||||||
</property>
|
|
||||||
<property name="maximum">
|
|
||||||
<number>99999</number>
|
|
||||||
</property>
|
|
||||||
<property name="value">
|
|
||||||
<number>-1</number>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="4" column="0">
|
|
||||||
<widget class="QLabel" name="label_b">
|
|
||||||
<property name="text">
|
|
||||||
<string>Remove footers at &bottom of page by:</string>
|
|
||||||
</property>
|
|
||||||
<property name="buddy">
|
|
||||||
<cstring>opt_pdf_footer_skip</cstring>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="4" column="1">
|
<item row="4" column="1">
|
||||||
<widget class="QSpinBox" name="opt_pdf_footer_skip">
|
<widget class="QSpinBox" name="opt_pdf_footer_skip">
|
||||||
<property name="specialValueText">
|
<property name="specialValueText">
|
||||||
@ -102,15 +43,8 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="5" column="0">
|
<item row="2" column="1">
|
||||||
<widget class="QLabel" name="label_rt">
|
<widget class="QComboBox" name="opt_pdf_engine"/>
|
||||||
<property name="text">
|
|
||||||
<string>Regular expression to remove &header at top of page:</string>
|
|
||||||
</property>
|
|
||||||
<property name="buddy">
|
|
||||||
<cstring>opt_pdf_header_regex</cstring>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
</item>
|
||||||
<item row="5" column="1">
|
<item row="5" column="1">
|
||||||
<widget class="QLineEdit" name="opt_pdf_header_regex">
|
<widget class="QLineEdit" name="opt_pdf_header_regex">
|
||||||
@ -119,16 +53,6 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="6" column="0">
|
|
||||||
<widget class="QLabel" name="label_rb">
|
|
||||||
<property name="text">
|
|
||||||
<string>Regular expression to remove &footer at bottom of page:</string>
|
|
||||||
</property>
|
|
||||||
<property name="buddy">
|
|
||||||
<cstring>opt_pdf_footer_regex</cstring>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="6" column="1">
|
<item row="6" column="1">
|
||||||
<widget class="QLineEdit" name="opt_pdf_footer_regex">
|
<widget class="QLineEdit" name="opt_pdf_footer_regex">
|
||||||
<property name="clearButtonEnabled">
|
<property name="clearButtonEnabled">
|
||||||
@ -149,10 +73,92 @@
|
|||||||
</property>
|
</property>
|
||||||
</spacer>
|
</spacer>
|
||||||
</item>
|
</item>
|
||||||
<item row="2" column="0" colspan="2">
|
<item row="5" column="0">
|
||||||
<widget class="QCheckBox" name="opt_new_pdf_engine">
|
<widget class="QLabel" name="label_rt">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>New, experimental, PDF conversion &engine</string>
|
<string>Regular expression to remove &header at top of page:</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_pdf_header_regex</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="1" column="0">
|
||||||
|
<widget class="QCheckBox" name="opt_no_images">
|
||||||
|
<property name="text">
|
||||||
|
<string>No &images</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="6" column="0">
|
||||||
|
<widget class="QLabel" name="label_rb">
|
||||||
|
<property name="text">
|
||||||
|
<string>Regular expression to remove &footer at bottom of page:</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_pdf_footer_regex</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="0" column="1">
|
||||||
|
<widget class="QDoubleSpinBox" name="opt_unwrap_factor">
|
||||||
|
<property name="maximum">
|
||||||
|
<double>1.000000000000000</double>
|
||||||
|
</property>
|
||||||
|
<property name="singleStep">
|
||||||
|
<double>0.010000000000000</double>
|
||||||
|
</property>
|
||||||
|
<property name="value">
|
||||||
|
<double>0.450000000000000</double>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="4" column="0">
|
||||||
|
<widget class="QLabel" name="label_b">
|
||||||
|
<property name="text">
|
||||||
|
<string>Remove footers at &bottom of page by:</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_pdf_footer_skip</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="2" column="0">
|
||||||
|
<widget class="QLabel" name="label">
|
||||||
|
<property name="text">
|
||||||
|
<string>PDF &engine:</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_pdf_engine</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="0" column="0">
|
||||||
|
<widget class="QLabel" name="label_2">
|
||||||
|
<property name="text">
|
||||||
|
<string>Line &un-wrapping factor:</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_unwrap_factor</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="3" column="1">
|
||||||
|
<widget class="QSpinBox" name="opt_pdf_header_skip">
|
||||||
|
<property name="specialValueText">
|
||||||
|
<string>Automatically</string>
|
||||||
|
</property>
|
||||||
|
<property name="suffix">
|
||||||
|
<string> px</string>
|
||||||
|
</property>
|
||||||
|
<property name="minimum">
|
||||||
|
<number>-1</number>
|
||||||
|
</property>
|
||||||
|
<property name="maximum">
|
||||||
|
<number>99999</number>
|
||||||
|
</property>
|
||||||
|
<property name="value">
|
||||||
|
<number>-1</number>
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
|
@ -487,7 +487,7 @@ def pdf_input(container):
|
|||||||
container.appendChild(g)
|
container.appendChild(g)
|
||||||
g.appendChild(float_spin('unwrap_factor', _('Line &un-wrapping factor:'), max=1, step=0.01))
|
g.appendChild(float_spin('unwrap_factor', _('Line &un-wrapping factor:'), max=1, step=0.01))
|
||||||
g.appendChild(checkbox('no_images', _('No &images')))
|
g.appendChild(checkbox('no_images', _('No &images')))
|
||||||
g.appendChild(checkbox('new_pdf_engine', _('New, experimental, PDF conversion &engine')))
|
g.appendChild(choices('pdf_engine', _('PDF &engine:'), {'calibre': 'calibre', 'pdftohtml': 'pdftohtml'}))
|
||||||
g.appendChild(int_spin('pdf_header_skip', _('Remove headers at &top of page by:'), min=-1, max=999999, step=1))
|
g.appendChild(int_spin('pdf_header_skip', _('Remove headers at &top of page by:'), min=-1, max=999999, step=1))
|
||||||
g.appendChild(int_spin('pdf_footer_skip', _('Remove footers at &bottom of page by:'), min=-1, max=999999, step=1))
|
g.appendChild(int_spin('pdf_footer_skip', _('Remove footers at &bottom of page by:'), min=-1, max=999999, step=1))
|
||||||
g.appendChild(lineedit('pdf_header_regex', _('Regular expression to remove &header at top of page:')))
|
g.appendChild(lineedit('pdf_header_regex', _('Regular expression to remove &header at top of page:')))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user