mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
New engine to convert PDF files with support for automatic header/footer removal. Fixes #2076346 [PDF conversion new engine does not work](https://bugs.launchpad.net/calibre/+bug/2076346)
This commit is contained in:
parent
f4a4f19786
commit
a8fb98459b
@ -221,7 +221,7 @@ OPTIONS = {
|
||||
|
||||
'fb2': ('no_inline_fb2_toc',),
|
||||
|
||||
'pdf': ('no_images', 'unwrap_factor'),
|
||||
'pdf': ('no_images', 'unwrap_factor', 'new_pdf_engine', 'pdf_header_skip', 'pdf_footer_skip', 'pdf_header_regex', 'pdf_footer_regex'),
|
||||
|
||||
'rtf': ('ignore_wmf',),
|
||||
|
||||
|
@ -11,7 +11,7 @@ from polyglot.builtins import as_bytes
|
||||
class PDFInput(InputFormatPlugin):
|
||||
|
||||
name = 'PDF Input'
|
||||
author = 'Kovid Goyal and John Schember'
|
||||
author = 'Kovid Goyal and John Schember and Alan Pettigrew'
|
||||
description = _('Convert PDF files to HTML')
|
||||
file_types = {'pdf'}
|
||||
commit_name = 'pdf_input'
|
||||
@ -24,20 +24,27 @@ class PDFInput(InputFormatPlugin):
|
||||
'be unwrapped. Valid values are a decimal between 0 and 1. The '
|
||||
'default is 0.45, just below the median line length.')),
|
||||
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
|
||||
help=_('Use the new PDF conversion engine. Currently not operational.'))
|
||||
help=_('Use the new, experimental, PDF conversion engine.')),
|
||||
OptionRecommendation(name='pdf_header_skip', recommended_value=-1,
|
||||
help=_('Skip everything to the specified number pixels at the top of a page.'
|
||||
' Negative numbers mean auto-detect and remove headers, zero means do not remove headers and positive numbers'
|
||||
' mean remove headers that appear above that many pixels from the top of the page. Works only'
|
||||
' with the new PDF engine.'
|
||||
)),
|
||||
OptionRecommendation(name='pdf_footer_skip', recommended_value=-1,
|
||||
help=_('Skip everything to the specified number of pixels at the bottom of a page.'
|
||||
' Negative numbers mean auto-detect and remove footers, zero means do not remove footers and positive numbers'
|
||||
' mean remove footers that appear below that many pixels from the bottom of the page. Works only'
|
||||
' with the new PDF engine.'
|
||||
)),
|
||||
OptionRecommendation(name='pdf_header_regex', recommended_value='',
|
||||
help=_('Regular expression to remove lines at the top of a page. '
|
||||
'This only looks at the first line of a page and works only with the new PDF engine.')),
|
||||
OptionRecommendation(name='pdf_footer_regex', recommended_value='',
|
||||
help=_('Regular expression to remove lines at the bottom of a page. '
|
||||
'This only looks at the last line of a page and works only with the new PDF engine.')),
|
||||
}
|
||||
|
||||
def convert_new(self, stream, accelerators):
    '''
    Convert the PDF behind ``stream`` via pdftohtml's XML output and
    PDFDocument.

    pdftohtml is run against the current working directory with
    ``as_xml=True``; the ``index.xml`` found there is read as bytes,
    passed through clean_ascii_chars and handed to PDFDocument together
    with the conversion options and the log.

    :param stream: File object for the input PDF; only ``stream.name``
        is used here.
    :param accelerators: Not used by this method.
    :return: Path of ``metadata.opf`` joined onto the current working
        directory.
    '''
    from calibre.ebooks.pdf.pdftohtml import pdftohtml
    from calibre.ebooks.pdf.reflow import PDFDocument
    from calibre.utils.cleantext import clean_ascii_chars

    workdir = os.getcwd()
    pdftohtml(workdir, stream.name, self.opts.no_images, as_xml=True)

    # index.xml is opened relative to the working directory.
    with open('index.xml', 'rb') as src:
        raw = src.read()
    cleaned = clean_ascii_chars(raw)

    # PDFDocument is instantiated only for its effects; the instance is
    # not kept.
    PDFDocument(cleaned, self.opts, self.log)
    return os.path.join(os.getcwd(), 'metadata.opf')
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
@ -47,8 +54,14 @@ class PDFInput(InputFormatPlugin):
|
||||
# The main html file will be named index.html
|
||||
self.opts, self.log = options, log
|
||||
if options.new_pdf_engine:
|
||||
return self.convert_new(stream, accelerators)
|
||||
pdftohtml(os.getcwd(), stream.name, options.no_images)
|
||||
from calibre.ebooks.pdf.reflow import PDFDocument
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
|
||||
with open(u'index.xml', 'rb') as f:
|
||||
xml = clean_ascii_chars(f.read())
|
||||
PDFDocument(xml, self.opts, self.log)
|
||||
else:
|
||||
pdftohtml(os.getcwd(), stream.name, options.no_images)
|
||||
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
log.debug('Retrieving document metadata...')
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -18,8 +18,17 @@ class PluginWidget(Widget, Ui_Form):
|
||||
Widget.__init__(self, parent, OPTIONS['input']['pdf'])
|
||||
self.db, self.book_id = db, book_id
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
||||
self.opt_new_pdf_engine.toggled.connect(self.update_engine_opts)
|
||||
self.update_engine_opts()
|
||||
|
||||
def set_value_handler(self, g, val):
    '''
    Special-case a missing value for a QDoubleSpinBox widget.

    :param g: The widget whose value is being set.
    :param val: The value to apply.
    :return: True when ``val`` is None and ``g`` is a QDoubleSpinBox (the
        spin box is then set to 0.0); None in every other case.
    '''
    # Guard clauses: anything that is not "None into a double spin box"
    # falls through with an implicit None result.
    if val is not None:
        return
    if not isinstance(g, QDoubleSpinBox):
        return
    g.setValue(0.0)
    return True
|
||||
|
||||
def update_engine_opts(self):
    '''
    Enable or disable the header/footer option widgets to match the
    checked state of the new PDF engine checkbox.
    '''
    use_new_engine = self.opt_new_pdf_engine.isChecked()
    dependent_widgets = (
        self.opt_pdf_footer_skip,
        self.opt_pdf_header_skip,
        self.opt_pdf_header_regex,
        self.opt_pdf_footer_regex,
    )
    for widget in dependent_widgets:
        widget.setEnabled(use_new_engine)
|
||||
|
@ -6,7 +6,7 @@
|
||||
<rect>
|
||||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>400</width>
|
||||
<width>413</width>
|
||||
<height>300</height>
|
||||
</rect>
|
||||
</property>
|
||||
@ -24,19 +24,6 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="2" column="0">
|
||||
<spacer name="verticalSpacer">
|
||||
<property name="orientation">
|
||||
<enum>Qt::Vertical</enum>
|
||||
</property>
|
||||
<property name="sizeHint" stdset="0">
|
||||
<size>
|
||||
<width>20</width>
|
||||
<height>213</height>
|
||||
</size>
|
||||
</property>
|
||||
</spacer>
|
||||
</item>
|
||||
<item row="0" column="1">
|
||||
<widget class="QDoubleSpinBox" name="opt_unwrap_factor">
|
||||
<property name="maximum">
|
||||
@ -57,6 +44,118 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="3" column="0">
|
||||
<widget class="QLabel" name="label_t">
|
||||
<property name="text">
|
||||
<string>Remove headers at &top of page by:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_pdf_header_skip</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="3" column="1">
|
||||
<widget class="QSpinBox" name="opt_pdf_header_skip">
|
||||
<property name="specialValueText">
|
||||
<string>Automatically</string>
|
||||
</property>
|
||||
<property name="suffix">
|
||||
<string> px</string>
|
||||
</property>
|
||||
<property name="minimum">
|
||||
<number>-1</number>
|
||||
</property>
|
||||
<property name="maximum">
|
||||
<number>99999</number>
|
||||
</property>
|
||||
<property name="value">
|
||||
<number>-1</number>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="4" column="0">
|
||||
<widget class="QLabel" name="label_b">
|
||||
<property name="text">
|
||||
<string>Remove footers at &bottom of page by:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_pdf_footer_skip</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="4" column="1">
|
||||
<widget class="QSpinBox" name="opt_pdf_footer_skip">
|
||||
<property name="specialValueText">
|
||||
<string>Automatically</string>
|
||||
</property>
|
||||
<property name="suffix">
|
||||
<string> px</string>
|
||||
</property>
|
||||
<property name="minimum">
|
||||
<number>-1</number>
|
||||
</property>
|
||||
<property name="maximum">
|
||||
<number>99999</number>
|
||||
</property>
|
||||
<property name="value">
|
||||
<number>-1</number>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="5" column="0">
|
||||
<widget class="QLabel" name="label_rt">
|
||||
<property name="text">
|
||||
<string>Regular expression to remove &header at top of page:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_pdf_header_regex</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="5" column="1">
|
||||
<widget class="QLineEdit" name="opt_pdf_header_regex">
|
||||
<property name="clearButtonEnabled">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="6" column="0">
|
||||
<widget class="QLabel" name="label_rb">
|
||||
<property name="text">
|
||||
<string>Regular expression to remove &footer at bottom of page:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_pdf_footer_regex</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="6" column="1">
|
||||
<widget class="QLineEdit" name="opt_pdf_footer_regex">
|
||||
<property name="clearButtonEnabled">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="7" column="0">
|
||||
<spacer name="verticalSpacer">
|
||||
<property name="orientation">
|
||||
<enum>Qt::Vertical</enum>
|
||||
</property>
|
||||
<property name="sizeHint" stdset="0">
|
||||
<size>
|
||||
<width>20</width>
|
||||
<height>20</height>
|
||||
</size>
|
||||
</property>
|
||||
</spacer>
|
||||
</item>
|
||||
<item row="2" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_new_pdf_engine">
|
||||
<property name="text">
|
||||
<string>&New, experimental, PDF conversion Engine</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
<resources/>
|
||||
|
@ -487,6 +487,11 @@ def pdf_input(container):
|
||||
container.appendChild(g)
|
||||
g.appendChild(float_spin('unwrap_factor', _('Line &un-wrapping factor:'), max=1, step=0.01))
|
||||
g.appendChild(checkbox('no_images', _('No &images')))
|
||||
g.appendChild(checkbox('new_pdf_engine', _('New, experimental, PDF conversion &engine')))
|
||||
g.appendChild(int_spin('pdf_header_skip', _('Remove headers at &top of page by:'), min=-1, max=999999, step=1))
|
||||
g.appendChild(int_spin('pdf_footer_skip', _('Remove footers at &bottom of page by:'), min=-1, max=999999, step=1))
|
||||
g.appendChild(lineedit('pdf_header_regex', _('Regular expression to remove &header at top of page:')))
|
||||
g.appendChild(lineedit('pdf_footer_regex', _('Regular expression to remove &footer at bottom of page:')))
|
||||
# }}}
|
||||
|
||||
# RTF Input {{{
|
||||
|
Loading…
x
Reference in New Issue
Block a user