mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
PDF Input: Automatic header/footer detection and removal
A new, dedicated PDF Input engine for calibre that implements automatic detection of headers and footers based on document analysis. The new engine is the default; to go back to using the old engine, select it in the PDF Input section of the Conversion dialog. See #2076346 (PDF conversion new engine does not work)
This commit is contained in:
parent
16cb7d3083
commit
8359e89cac
@ -221,7 +221,7 @@ OPTIONS = {
|
||||
|
||||
'fb2': ('no_inline_fb2_toc',),
|
||||
|
||||
'pdf': ('no_images', 'unwrap_factor', 'new_pdf_engine', 'pdf_header_skip', 'pdf_footer_skip', 'pdf_header_regex', 'pdf_footer_regex'),
|
||||
'pdf': ('no_images', 'unwrap_factor', 'pdf_engine', 'pdf_header_skip', 'pdf_footer_skip', 'pdf_header_regex', 'pdf_footer_regex'),
|
||||
|
||||
'rtf': ('ignore_wmf',),
|
||||
|
||||
|
@ -7,6 +7,8 @@ import os
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from polyglot.builtins import as_bytes
|
||||
|
||||
ENGINES = 'calibre', 'pdftohtml'
|
||||
|
||||
|
||||
class PDFInput(InputFormatPlugin):
|
||||
|
||||
@ -23,8 +25,10 @@ class PDFInput(InputFormatPlugin):
|
||||
help=_('Scale used to determine the length at which a line should '
|
||||
'be unwrapped. Valid values are a decimal between 0 and 1. The '
|
||||
'default is 0.45, just below the median line length.')),
|
||||
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
|
||||
help=_('Use the new, experimental, PDF conversion engine.')),
|
||||
OptionRecommendation(name='pdf_engine', recommended_value='calibre', choices=('calibre', 'pdftohtml'),
|
||||
help=_('The PDF engine to use, the "calibre" engine is recommended as it has automatic header and footer removal.'
|
||||
' Choices: {}'
|
||||
).format(', '.join(ENGINES))),
|
||||
OptionRecommendation(name='pdf_header_skip', recommended_value=-1,
|
||||
help=_('Skip everything to the specified number of pixels at the top of a page.'
|
||||
' Negative numbers mean auto-detect and remove headers, zero means do not remove headers and positive numbers'
|
||||
@ -35,14 +39,14 @@ class PDFInput(InputFormatPlugin):
|
||||
help=_('Skip everything to the specified number of pixels at the bottom of a page.'
|
||||
' Negative numbers mean auto-detect and remove footers, zero means do not remove footers and positive numbers'
|
||||
' mean remove footers that appear below that many pixels from the bottom of the page. Works only'
|
||||
' with the new PDF engine.'
|
||||
' with the calibre PDF engine.'
|
||||
)),
|
||||
OptionRecommendation(name='pdf_header_regex', recommended_value='',
|
||||
help=_('Regular expression to remove lines at the top of a page. '
|
||||
'This only looks at the first line of a page and works only with the new PDF engine.')),
|
||||
'This only looks at the first line of a page and works only with the calibre PDF engine.')),
|
||||
OptionRecommendation(name='pdf_footer_regex', recommended_value='',
|
||||
help=_('Regular expression to remove lines at the bottom of a page. '
|
||||
'This only looks at the last line of a page and works only with the new PDF engine.')),
|
||||
'This only looks at the last line of a page and works only with the calibre PDF engine.')),
|
||||
}
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
@ -53,7 +57,7 @@ class PDFInput(InputFormatPlugin):
|
||||
log.debug('Converting file to html...')
|
||||
# The main html file will be named index.html
|
||||
self.opts, self.log = options, log
|
||||
if options.new_pdf_engine:
|
||||
if options.pdf_engine == 'calibre':
|
||||
from calibre.ebooks.pdf.reflow import PDFDocument
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
|
||||
|
@ -17,17 +17,23 @@ class PluginWidget(Widget, Ui_Form):
|
||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||
Widget.__init__(self, parent, OPTIONS['input']['pdf'])
|
||||
self.db, self.book_id = db, book_id
|
||||
from calibre.ebooks.conversion.plugins.pdf_input import ENGINES
|
||||
self.opt_pdf_engine.addItems(ENGINES)
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
||||
self.opt_new_pdf_engine.toggled.connect(self.update_engine_opts)
|
||||
self.opt_pdf_engine.currentIndexChanged.connect(self.update_engine_opts)
|
||||
self.update_engine_opts()
|
||||
|
||||
def set_value_handler(self, g, val):
|
||||
if val is None and isinstance(g, QDoubleSpinBox):
|
||||
g.setValue(0.0)
|
||||
return True
|
||||
if g is self.opt_pdf_engine:
|
||||
idx = g.findText(val)
|
||||
if idx > -1:
|
||||
g.setCurrentIndex(idx)
|
||||
|
||||
def update_engine_opts(self):
|
||||
enabled = self.opt_new_pdf_engine.isChecked()
|
||||
enabled = self.opt_pdf_engine.currentText() == 'calibre'
|
||||
self.opt_pdf_footer_skip.setEnabled(enabled)
|
||||
self.opt_pdf_header_skip.setEnabled(enabled)
|
||||
self.opt_pdf_header_regex.setEnabled(enabled)
|
||||
|
@ -14,36 +14,6 @@
|
||||
<string>Form</string>
|
||||
</property>
|
||||
<layout class="QGridLayout" name="gridLayout">
|
||||
<item row="0" column="0">
|
||||
<widget class="QLabel" name="label_2">
|
||||
<property name="text">
|
||||
<string>Line &un-wrapping factor:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_unwrap_factor</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="0" column="1">
|
||||
<widget class="QDoubleSpinBox" name="opt_unwrap_factor">
|
||||
<property name="maximum">
|
||||
<double>1.000000000000000</double>
|
||||
</property>
|
||||
<property name="singleStep">
|
||||
<double>0.010000000000000</double>
|
||||
</property>
|
||||
<property name="value">
|
||||
<double>0.450000000000000</double>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="1" column="0">
|
||||
<widget class="QCheckBox" name="opt_no_images">
|
||||
<property name="text">
|
||||
<string>No &images</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="3" column="0">
|
||||
<widget class="QLabel" name="label_t">
|
||||
<property name="text">
|
||||
@ -54,35 +24,6 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="3" column="1">
|
||||
<widget class="QSpinBox" name="opt_pdf_header_skip">
|
||||
<property name="specialValueText">
|
||||
<string>Automatically</string>
|
||||
</property>
|
||||
<property name="suffix">
|
||||
<string> px</string>
|
||||
</property>
|
||||
<property name="minimum">
|
||||
<number>-1</number>
|
||||
</property>
|
||||
<property name="maximum">
|
||||
<number>99999</number>
|
||||
</property>
|
||||
<property name="value">
|
||||
<number>-1</number>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="4" column="0">
|
||||
<widget class="QLabel" name="label_b">
|
||||
<property name="text">
|
||||
<string>Remove footers at &bottom of page by:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_pdf_footer_skip</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="4" column="1">
|
||||
<widget class="QSpinBox" name="opt_pdf_footer_skip">
|
||||
<property name="specialValueText">
|
||||
@ -102,15 +43,8 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="5" column="0">
|
||||
<widget class="QLabel" name="label_rt">
|
||||
<property name="text">
|
||||
<string>Regular expression to remove &header at top of page:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_pdf_header_regex</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
<item row="2" column="1">
|
||||
<widget class="QComboBox" name="opt_pdf_engine"/>
|
||||
</item>
|
||||
<item row="5" column="1">
|
||||
<widget class="QLineEdit" name="opt_pdf_header_regex">
|
||||
@ -119,16 +53,6 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="6" column="0">
|
||||
<widget class="QLabel" name="label_rb">
|
||||
<property name="text">
|
||||
<string>Regular expression to remove &footer at bottom of page:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_pdf_footer_regex</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="6" column="1">
|
||||
<widget class="QLineEdit" name="opt_pdf_footer_regex">
|
||||
<property name="clearButtonEnabled">
|
||||
@ -149,10 +73,92 @@
|
||||
</property>
|
||||
</spacer>
|
||||
</item>
|
||||
<item row="2" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_new_pdf_engine">
|
||||
<item row="5" column="0">
|
||||
<widget class="QLabel" name="label_rt">
|
||||
<property name="text">
|
||||
<string>New, experimental, PDF conversion &engine</string>
|
||||
<string>Regular expression to remove &header at top of page:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_pdf_header_regex</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="1" column="0">
|
||||
<widget class="QCheckBox" name="opt_no_images">
|
||||
<property name="text">
|
||||
<string>No &images</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="6" column="0">
|
||||
<widget class="QLabel" name="label_rb">
|
||||
<property name="text">
|
||||
<string>Regular expression to remove &footer at bottom of page:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_pdf_footer_regex</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="0" column="1">
|
||||
<widget class="QDoubleSpinBox" name="opt_unwrap_factor">
|
||||
<property name="maximum">
|
||||
<double>1.000000000000000</double>
|
||||
</property>
|
||||
<property name="singleStep">
|
||||
<double>0.010000000000000</double>
|
||||
</property>
|
||||
<property name="value">
|
||||
<double>0.450000000000000</double>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="4" column="0">
|
||||
<widget class="QLabel" name="label_b">
|
||||
<property name="text">
|
||||
<string>Remove footers at &bottom of page by:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_pdf_footer_skip</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="2" column="0">
|
||||
<widget class="QLabel" name="label">
|
||||
<property name="text">
|
||||
<string>PDF &engine:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_pdf_engine</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="0" column="0">
|
||||
<widget class="QLabel" name="label_2">
|
||||
<property name="text">
|
||||
<string>Line &un-wrapping factor:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_unwrap_factor</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="3" column="1">
|
||||
<widget class="QSpinBox" name="opt_pdf_header_skip">
|
||||
<property name="specialValueText">
|
||||
<string>Automatically</string>
|
||||
</property>
|
||||
<property name="suffix">
|
||||
<string> px</string>
|
||||
</property>
|
||||
<property name="minimum">
|
||||
<number>-1</number>
|
||||
</property>
|
||||
<property name="maximum">
|
||||
<number>99999</number>
|
||||
</property>
|
||||
<property name="value">
|
||||
<number>-1</number>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
|
@ -487,7 +487,7 @@ def pdf_input(container):
|
||||
container.appendChild(g)
|
||||
g.appendChild(float_spin('unwrap_factor', _('Line &un-wrapping factor:'), max=1, step=0.01))
|
||||
g.appendChild(checkbox('no_images', _('No &images')))
|
||||
g.appendChild(checkbox('new_pdf_engine', _('New, experimental, PDF conversion &engine')))
|
||||
g.appendChild(choices('pdf_engine', _('PDF &engine:'), {'calibre': 'calibre', 'pdftohtml': 'pdftohtml'}))
|
||||
g.appendChild(int_spin('pdf_header_skip', _('Remove headers at &top of page by:'), min=-1, max=999999, step=1))
|
||||
g.appendChild(int_spin('pdf_footer_skip', _('Remove footers at &bottom of page by:'), min=-1, max=999999, step=1))
|
||||
g.appendChild(lineedit('pdf_header_regex', _('Regular expression to remove &header at top of page:')))
|
||||
|
Loading…
x
Reference in New Issue
Block a user