mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
New engine to convert PDF files with support for automatic header/footer removal. Fixes #2076346 [PDF conversion new engine does not work](https://bugs.launchpad.net/calibre/+bug/2076346)
This commit is contained in:
parent
f4a4f19786
commit
a8fb98459b
@@ -221,7 +221,7 @@ OPTIONS = {
|
|||||||
|
|
||||||
'fb2': ('no_inline_fb2_toc',),
|
'fb2': ('no_inline_fb2_toc',),
|
||||||
|
|
||||||
'pdf': ('no_images', 'unwrap_factor'),
|
'pdf': ('no_images', 'unwrap_factor', 'new_pdf_engine', 'pdf_header_skip', 'pdf_footer_skip', 'pdf_header_regex', 'pdf_footer_regex'),
|
||||||
|
|
||||||
'rtf': ('ignore_wmf',),
|
'rtf': ('ignore_wmf',),
|
||||||
|
|
||||||
|
@@ -11,7 +11,7 @@ from polyglot.builtins import as_bytes
|
|||||||
class PDFInput(InputFormatPlugin):
|
class PDFInput(InputFormatPlugin):
|
||||||
|
|
||||||
name = 'PDF Input'
|
name = 'PDF Input'
|
||||||
author = 'Kovid Goyal and John Schember'
|
author = 'Kovid Goyal and John Schember and Alan Pettigrew'
|
||||||
description = _('Convert PDF files to HTML')
|
description = _('Convert PDF files to HTML')
|
||||||
file_types = {'pdf'}
|
file_types = {'pdf'}
|
||||||
commit_name = 'pdf_input'
|
commit_name = 'pdf_input'
|
||||||
@@ -24,20 +24,27 @@ class PDFInput(InputFormatPlugin):
|
|||||||
'be unwrapped. Valid values are a decimal between 0 and 1. The '
|
'be unwrapped. Valid values are a decimal between 0 and 1. The '
|
||||||
'default is 0.45, just below the median line length.')),
|
'default is 0.45, just below the median line length.')),
|
||||||
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
|
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
|
||||||
help=_('Use the new PDF conversion engine. Currently not operational.'))
|
help=_('Use the new, experimental, PDF conversion engine.')),
|
||||||
|
OptionRecommendation(name='pdf_header_skip', recommended_value=-1,
|
||||||
|
help=_('Skip everything to the specified number pixels at the top of a page.'
|
||||||
|
' Negative numbers mean auto-detect and remove headers, zero means do not remove headers and positive numbers'
|
||||||
|
' mean remove headers that appear above that many pixels from the top of the page. Works only'
|
||||||
|
' with the new PDF engine.'
|
||||||
|
)),
|
||||||
|
OptionRecommendation(name='pdf_footer_skip', recommended_value=-1,
|
||||||
|
help=_('Skip everything to the specified number of pixels at the bottom of a page.'
|
||||||
|
' Negative numbers mean auto-detect and remove footers, zero means do not remove footers and positive numbers'
|
||||||
|
' mean remove footers that appear below that many pixels from the bottom of the page. Works only'
|
||||||
|
' with the new PDF engine.'
|
||||||
|
)),
|
||||||
|
OptionRecommendation(name='pdf_header_regex', recommended_value='',
|
||||||
|
help=_('Regular expression to remove lines at the top of a page. '
|
||||||
|
'This only looks at the first line of a page and works only with the new PDF engine.')),
|
||||||
|
OptionRecommendation(name='pdf_footer_regex', recommended_value='',
|
||||||
|
help=_('Regular expression to remove lines at the bottom of a page. '
|
||||||
|
'This only looks at the last line of a page and works only with the new PDF engine.')),
|
||||||
}
|
}
|
||||||
|
|
||||||
def convert_new(self, stream, accelerators):
    """Convert the PDF behind ``stream`` using the new engine.

    Calls ``pdftohtml`` with ``as_xml=True`` on ``stream.name``, reads the
    ``index.xml`` file from the current working directory, passes its bytes
    through ``clean_ascii_chars`` and hands the result to ``PDFDocument``
    together with ``self.opts`` and ``self.log``.

    ``accelerators`` is accepted for signature compatibility and is not
    used in this method.

    Returns the path ``metadata.opf`` joined onto the current working
    directory.
    """
    # Imported lazily, inside the method, exactly as the original does.
    from calibre.ebooks.pdf.pdftohtml import pdftohtml
    from calibre.ebooks.pdf.reflow import PDFDocument
    from calibre.utils.cleantext import clean_ascii_chars

    # NOTE(review): presumably pdftohtml writes index.xml into the directory
    # given as its first argument -- confirm against pdftohtml itself.
    pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
    with open('index.xml', 'rb') as f:
        xml = clean_ascii_chars(f.read())
    # The PDFDocument instance is not kept; only the call itself matters here.
    PDFDocument(xml, self.opts, self.log)
    return os.path.join(os.getcwd(), 'metadata.opf')
|
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
@@ -47,8 +54,14 @@ class PDFInput(InputFormatPlugin):
|
|||||||
# The main html file will be named index.html
|
# The main html file will be named index.html
|
||||||
self.opts, self.log = options, log
|
self.opts, self.log = options, log
|
||||||
if options.new_pdf_engine:
|
if options.new_pdf_engine:
|
||||||
return self.convert_new(stream, accelerators)
|
from calibre.ebooks.pdf.reflow import PDFDocument
|
||||||
pdftohtml(os.getcwd(), stream.name, options.no_images)
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
|
pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
|
||||||
|
with open(u'index.xml', 'rb') as f:
|
||||||
|
xml = clean_ascii_chars(f.read())
|
||||||
|
PDFDocument(xml, self.opts, self.log)
|
||||||
|
else:
|
||||||
|
pdftohtml(os.getcwd(), stream.name, options.no_images)
|
||||||
|
|
||||||
from calibre.ebooks.metadata.meta import get_metadata
|
from calibre.ebooks.metadata.meta import get_metadata
|
||||||
log.debug('Retrieving document metadata...')
|
log.debug('Retrieving document metadata...')
|
||||||
|
File diff suppressed because it is too large
Load Diff
@@ -18,8 +18,17 @@ class PluginWidget(Widget, Ui_Form):
|
|||||||
Widget.__init__(self, parent, OPTIONS['input']['pdf'])
|
Widget.__init__(self, parent, OPTIONS['input']['pdf'])
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
self.initialize_options(get_option, get_help, db, book_id)
|
||||||
|
self.opt_new_pdf_engine.toggled.connect(self.update_engine_opts)
|
||||||
|
self.update_engine_opts()
|
||||||
|
|
||||||
def set_value_handler(self, g, val):
    """Handle a missing value for a floating point spin box.

    When ``val`` is ``None`` and ``g`` is a ``QDoubleSpinBox``, the widget
    is set to ``0.0`` and ``True`` is returned. In every other case the
    method falls through and implicitly returns ``None``.
    """
    if val is None and isinstance(g, QDoubleSpinBox):
        g.setValue(0.0)
        return True
|
||||||
|
|
||||||
|
def update_engine_opts(self):
    """Sync the header/footer controls with the new-engine checkbox.

    Reads the checked state of ``opt_new_pdf_engine`` and applies it as the
    enabled state of the header/footer skip spin boxes and the
    header/footer regex line edits.
    """
    enabled = self.opt_new_pdf_engine.isChecked()
    self.opt_pdf_footer_skip.setEnabled(enabled)
    self.opt_pdf_header_skip.setEnabled(enabled)
    self.opt_pdf_header_regex.setEnabled(enabled)
    self.opt_pdf_footer_regex.setEnabled(enabled)
|
||||||
|
@@ -6,7 +6,7 @@
|
|||||||
<rect>
|
<rect>
|
||||||
<x>0</x>
|
<x>0</x>
|
||||||
<y>0</y>
|
<y>0</y>
|
||||||
<width>400</width>
|
<width>413</width>
|
||||||
<height>300</height>
|
<height>300</height>
|
||||||
</rect>
|
</rect>
|
||||||
</property>
|
</property>
|
||||||
@@ -24,19 +24,6 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="2" column="0">
|
|
||||||
<spacer name="verticalSpacer">
|
|
||||||
<property name="orientation">
|
|
||||||
<enum>Qt::Vertical</enum>
|
|
||||||
</property>
|
|
||||||
<property name="sizeHint" stdset="0">
|
|
||||||
<size>
|
|
||||||
<width>20</width>
|
|
||||||
<height>213</height>
|
|
||||||
</size>
|
|
||||||
</property>
|
|
||||||
</spacer>
|
|
||||||
</item>
|
|
||||||
<item row="0" column="1">
|
<item row="0" column="1">
|
||||||
<widget class="QDoubleSpinBox" name="opt_unwrap_factor">
|
<widget class="QDoubleSpinBox" name="opt_unwrap_factor">
|
||||||
<property name="maximum">
|
<property name="maximum">
|
||||||
@@ -57,6 +44,118 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
|
<item row="3" column="0">
|
||||||
|
<widget class="QLabel" name="label_t">
|
||||||
|
<property name="text">
|
||||||
|
<string>Remove headers at &top of page by:</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_pdf_header_skip</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="3" column="1">
|
||||||
|
<widget class="QSpinBox" name="opt_pdf_header_skip">
|
||||||
|
<property name="specialValueText">
|
||||||
|
<string>Automatically</string>
|
||||||
|
</property>
|
||||||
|
<property name="suffix">
|
||||||
|
<string> px</string>
|
||||||
|
</property>
|
||||||
|
<property name="minimum">
|
||||||
|
<number>-1</number>
|
||||||
|
</property>
|
||||||
|
<property name="maximum">
|
||||||
|
<number>99999</number>
|
||||||
|
</property>
|
||||||
|
<property name="value">
|
||||||
|
<number>-1</number>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="4" column="0">
|
||||||
|
<widget class="QLabel" name="label_b">
|
||||||
|
<property name="text">
|
||||||
|
<string>Remove footers at &bottom of page by:</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_pdf_footer_skip</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="4" column="1">
|
||||||
|
<widget class="QSpinBox" name="opt_pdf_footer_skip">
|
||||||
|
<property name="specialValueText">
|
||||||
|
<string>Automatically</string>
|
||||||
|
</property>
|
||||||
|
<property name="suffix">
|
||||||
|
<string> px</string>
|
||||||
|
</property>
|
||||||
|
<property name="minimum">
|
||||||
|
<number>-1</number>
|
||||||
|
</property>
|
||||||
|
<property name="maximum">
|
||||||
|
<number>99999</number>
|
||||||
|
</property>
|
||||||
|
<property name="value">
|
||||||
|
<number>-1</number>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="5" column="0">
|
||||||
|
<widget class="QLabel" name="label_rt">
|
||||||
|
<property name="text">
|
||||||
|
<string>Regular expression to remove &header at top of page:</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_pdf_header_regex</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="5" column="1">
|
||||||
|
<widget class="QLineEdit" name="opt_pdf_header_regex">
|
||||||
|
<property name="clearButtonEnabled">
|
||||||
|
<bool>true</bool>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="6" column="0">
|
||||||
|
<widget class="QLabel" name="label_rb">
|
||||||
|
<property name="text">
|
||||||
|
<string>Regular expression to remove &footer at bottom of page:</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_pdf_footer_regex</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="6" column="1">
|
||||||
|
<widget class="QLineEdit" name="opt_pdf_footer_regex">
|
||||||
|
<property name="clearButtonEnabled">
|
||||||
|
<bool>true</bool>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="7" column="0">
|
||||||
|
<spacer name="verticalSpacer">
|
||||||
|
<property name="orientation">
|
||||||
|
<enum>Qt::Vertical</enum>
|
||||||
|
</property>
|
||||||
|
<property name="sizeHint" stdset="0">
|
||||||
|
<size>
|
||||||
|
<width>20</width>
|
||||||
|
<height>20</height>
|
||||||
|
</size>
|
||||||
|
</property>
|
||||||
|
</spacer>
|
||||||
|
</item>
|
||||||
|
<item row="2" column="0" colspan="2">
|
||||||
|
<widget class="QCheckBox" name="opt_new_pdf_engine">
|
||||||
|
<property name="text">
|
||||||
|
<string>&New, experimental, PDF conversion Engine</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
</layout>
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
<resources/>
|
<resources/>
|
||||||
|
@@ -487,6 +487,11 @@ def pdf_input(container):
|
|||||||
container.appendChild(g)
|
container.appendChild(g)
|
||||||
g.appendChild(float_spin('unwrap_factor', _('Line &un-wrapping factor:'), max=1, step=0.01))
|
g.appendChild(float_spin('unwrap_factor', _('Line &un-wrapping factor:'), max=1, step=0.01))
|
||||||
g.appendChild(checkbox('no_images', _('No &images')))
|
g.appendChild(checkbox('no_images', _('No &images')))
|
||||||
|
g.appendChild(checkbox('new_pdf_engine', _('New, experimental, PDF conversion &engine')))
|
||||||
|
g.appendChild(int_spin('pdf_header_skip', _('Remove headers at &top of page by:'), min=-1, max=999999, step=1))
|
||||||
|
g.appendChild(int_spin('pdf_footer_skip', _('Remove footers at &bottom of page by:'), min=-1, max=999999, step=1))
|
||||||
|
g.appendChild(lineedit('pdf_header_regex', _('Regular expression to remove &header at top of page:')))
|
||||||
|
g.appendChild(lineedit('pdf_footer_regex', _('Regular expression to remove &footer at bottom of page:')))
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
# RTF Input {{{
|
# RTF Input {{{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user