From 8359e89caccec27dda4a0311178d13ee9ff91212 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 17 Oct 2024 11:11:09 +0530 Subject: [PATCH] PDF Input: Automatic header/footer detection and removal A new, dedicated PDF Input engine for calibre that implements automatic detection of headers and footers based on document analysis. The new engine is the default; to go back to using the old engine, select it in the PDF Input section of the Conversion dialog. See #2076346 (PDF conversion new engine does not work) --- src/calibre/ebooks/conversion/config.py | 2 +- .../ebooks/conversion/plugins/pdf_input.py | 16 +- src/calibre/gui2/convert/pdf_input.py | 10 +- src/calibre/gui2/convert/pdf_input.ui | 168 +++++++++--------- src/pyj/book_list/conversion_widgets.pyj | 2 +- 5 files changed, 107 insertions(+), 91 deletions(-) diff --git a/src/calibre/ebooks/conversion/config.py b/src/calibre/ebooks/conversion/config.py index b7591d8f03..9d39528a04 100644 --- a/src/calibre/ebooks/conversion/config.py +++ b/src/calibre/ebooks/conversion/config.py @@ -221,7 +221,7 @@ OPTIONS = { 'fb2': ('no_inline_fb2_toc',), - 'pdf': ('no_images', 'unwrap_factor', 'new_pdf_engine', 'pdf_header_skip', 'pdf_footer_skip', 'pdf_header_regex', 'pdf_footer_regex'), + 'pdf': ('no_images', 'unwrap_factor', 'pdf_engine', 'pdf_header_skip', 'pdf_footer_skip', 'pdf_header_regex', 'pdf_footer_regex'), 'rtf': ('ignore_wmf',), diff --git a/src/calibre/ebooks/conversion/plugins/pdf_input.py b/src/calibre/ebooks/conversion/plugins/pdf_input.py index 2408f79b58..2b3d6aa560 100644 --- a/src/calibre/ebooks/conversion/plugins/pdf_input.py +++ b/src/calibre/ebooks/conversion/plugins/pdf_input.py @@ -7,6 +7,8 @@ import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from polyglot.builtins import as_bytes +ENGINES = 'calibre', 'pdftohtml' + class PDFInput(InputFormatPlugin): @@ -23,8 +25,10 @@ class PDFInput(InputFormatPlugin): help=_('Scale used to determine the length at which a 
line should ' 'be unwrapped. Valid values are a decimal between 0 and 1. The ' 'default is 0.45, just below the median line length.')), - OptionRecommendation(name='new_pdf_engine', recommended_value=False, - help=_('Use the new, experimental, PDF conversion engine.')), + OptionRecommendation(name='pdf_engine', recommended_value='calibre', choices=('calibre', 'pdftohtml'), + help=_('The PDF engine to use, the "calibre" engine is recommended as it has automatic header and footer removal.' + ' Choices: {}' + ).format(', '.join(ENGINES))), OptionRecommendation(name='pdf_header_skip', recommended_value=-1, help=_('Skip everything to the specified number of pixels at the top of a page.' ' Negative numbers mean auto-detect and remove headers, zero means do not remove headers and positive numbers' @@ -35,14 +39,14 @@ class PDFInput(InputFormatPlugin): help=_('Skip everything to the specified number of pixels at the bottom of a page.' ' Negative numbers mean auto-detect and remove footers, zero means do not remove footers and positive numbers' ' mean remove footers that appear below that many pixels from the bottom of the page. Works only' - ' with the new PDF engine.' + ' with the calibre PDF engine.' )), OptionRecommendation(name='pdf_header_regex', recommended_value='', help=_('Regular expression to remove lines at the top of a page. ' - 'This only looks at the first line of a page and works only with the new PDF engine.')), + 'This only looks at the first line of a page and works only with the calibre PDF engine.')), OptionRecommendation(name='pdf_footer_regex', recommended_value='', help=_('Regular expression to remove lines at the bottom of a page. 
' - 'This only looks at the last line of a page and works only with the new PDF engine.')), + 'This only looks at the last line of a page and works only with the calibre PDF engine.')), } def convert(self, stream, options, file_ext, log, @@ -53,7 +57,7 @@ class PDFInput(InputFormatPlugin): log.debug('Converting file to html...') # The main html file will be named index.html self.opts, self.log = options, log - if options.new_pdf_engine: + if options.pdf_engine == 'calibre': from calibre.ebooks.pdf.reflow import PDFDocument from calibre.utils.cleantext import clean_ascii_chars pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True) diff --git a/src/calibre/gui2/convert/pdf_input.py b/src/calibre/gui2/convert/pdf_input.py index a431660978..615f0947e2 100644 --- a/src/calibre/gui2/convert/pdf_input.py +++ b/src/calibre/gui2/convert/pdf_input.py @@ -17,17 +17,23 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, OPTIONS['input']['pdf']) self.db, self.book_id = db, book_id + from calibre.ebooks.conversion.plugins.pdf_input import ENGINES + self.opt_pdf_engine.addItems(ENGINES) self.initialize_options(get_option, get_help, db, book_id) - self.opt_new_pdf_engine.toggled.connect(self.update_engine_opts) + self.opt_pdf_engine.currentIndexChanged.connect(self.update_engine_opts) self.update_engine_opts() def set_value_handler(self, g, val): if val is None and isinstance(g, QDoubleSpinBox): g.setValue(0.0) return True + if g is self.opt_pdf_engine: + idx = g.findText(val) + if idx > -1: + g.setCurrentIndex(idx) def update_engine_opts(self): - enabled = self.opt_new_pdf_engine.isChecked() + enabled = self.opt_pdf_engine.currentText() == 'calibre' self.opt_pdf_footer_skip.setEnabled(enabled) self.opt_pdf_header_skip.setEnabled(enabled) self.opt_pdf_header_regex.setEnabled(enabled) diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui 
index e966f8a6f6..194c8e6f85 100644 --- a/src/calibre/gui2/convert/pdf_input.ui +++ b/src/calibre/gui2/convert/pdf_input.ui @@ -14,36 +14,6 @@ Form - - - - Line &un-wrapping factor: - - - opt_unwrap_factor - - - - - - - 1.000000000000000 - - - 0.010000000000000 - - - 0.450000000000000 - - - - - - - No &images - - - @@ -54,35 +24,6 @@ - - - - Automatically - - - px - - - -1 - - - 99999 - - - -1 - - - - - - - Remove footers at &bottom of page by: - - - opt_pdf_footer_skip - - - @@ -102,15 +43,8 @@ - - - - Regular expression to remove &header at top of page: - - - opt_pdf_header_regex - - + + @@ -119,16 +53,6 @@ - - - - Regular expression to remove &footer at bottom of page: - - - opt_pdf_footer_regex - - - @@ -149,10 +73,92 @@ - - + + - New, experimental, PDF conversion &engine + Regular expression to remove &header at top of page: + + + opt_pdf_header_regex + + + + + + + No &images + + + + + + + Regular expression to remove &footer at bottom of page: + + + opt_pdf_footer_regex + + + + + + + 1.000000000000000 + + + 0.010000000000000 + + + 0.450000000000000 + + + + + + + Remove footers at &bottom of page by: + + + opt_pdf_footer_skip + + + + + + + PDF &engine: + + + opt_pdf_engine + + + + + + + Line &un-wrapping factor: + + + opt_unwrap_factor + + + + + + + Automatically + + + px + + + -1 + + + 99999 + + + -1 diff --git a/src/pyj/book_list/conversion_widgets.pyj b/src/pyj/book_list/conversion_widgets.pyj index e56e60cdf1..35fe521616 100644 --- a/src/pyj/book_list/conversion_widgets.pyj +++ b/src/pyj/book_list/conversion_widgets.pyj @@ -487,7 +487,7 @@ def pdf_input(container): container.appendChild(g) g.appendChild(float_spin('unwrap_factor', _('Line &un-wrapping factor:'), max=1, step=0.01)) g.appendChild(checkbox('no_images', _('No &images'))) - g.appendChild(checkbox('new_pdf_engine', _('New, experimental, PDF conversion &engine'))) + g.appendChild(choices('pdf_engine', _('PDF &engine:'), {'calibre': 'calibre', 'pdftohtml': 'pdftohtml'})) 
g.appendChild(int_spin('pdf_header_skip', _('Remove headers at &top of page by:'), min=-1, max=999999, step=1)) g.appendChild(int_spin('pdf_footer_skip', _('Remove footers at &bottom of page by:'), min=-1, max=999999, step=1)) g.appendChild(lineedit('pdf_header_regex', _('Regular expression to remove &header at top of page:')))