mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
PDF Input: User can specify regex to use to remove header and footer. Preprocessor: Able to use options from input plugins.
This commit is contained in:
parent
8e1f51d8cb
commit
eb896d010f
@ -694,7 +694,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
|
||||
'''
|
||||
from calibre.ebooks.oeb.base import OEBBook
|
||||
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
|
||||
opts.preprocess_html, getattr(opts, 'pdf_line_length', 0.5))
|
||||
opts.preprocess_html, opts)
|
||||
oeb = OEBBook(log, html_preprocessor,
|
||||
pretty_print=opts.pretty_print, input_encoding=encoding)
|
||||
if not populate:
|
||||
|
@ -140,8 +140,6 @@ class HTMLPreProcessor(object):
|
||||
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
||||
# Connect paragraphs split by -
|
||||
(re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''),
|
||||
# Remove - that splits words
|
||||
(re.compile(u'(?<=[^\s])[-–]+(?=[^\s])'), lambda match: ''),
|
||||
# Add space before and after italics
|
||||
(re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
|
||||
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
|
||||
@ -163,10 +161,10 @@ class HTMLPreProcessor(object):
|
||||
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
||||
]
|
||||
def __init__(self, input_plugin_preprocess, plugin_preprocess,
|
||||
pdf_line_length):
|
||||
extra_opts=None):
|
||||
self.input_plugin_preprocess = input_plugin_preprocess
|
||||
self.plugin_preprocess = plugin_preprocess
|
||||
self.pdf_line_length = pdf_line_length
|
||||
self.extra_opts = extra_opts
|
||||
|
||||
def is_baen(self, src):
|
||||
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
|
||||
@ -187,15 +185,26 @@ class HTMLPreProcessor(object):
|
||||
elif self.is_book_designer(html):
|
||||
rules = self.BOOK_DESIGNER
|
||||
elif self.is_pdftohtml(html):
|
||||
length = line_length(html, self.pdf_line_length)
|
||||
line_length_rules = []
|
||||
if length:
|
||||
line_length_rules = [
|
||||
# Un wrap using punctuation
|
||||
(re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
|
||||
]
|
||||
start_rules = []
|
||||
end_rules = []
|
||||
|
||||
rules = self.PDFTOHTML + line_length_rules
|
||||
if getattr(self.extra_opts, 'remove_header', None):
|
||||
start_rules.append(
|
||||
(re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
|
||||
)
|
||||
if getattr(self.extra_opts, 'remove_footer', None):
|
||||
start_rules.append(
|
||||
(re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
|
||||
)
|
||||
if getattr(self.extra_opts, 'unwrap_factor', None):
|
||||
length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
|
||||
if length:
|
||||
end_rules.append(
|
||||
# Un wrap using punctuation
|
||||
(re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
|
||||
)
|
||||
|
||||
rules = start_rules + self.PDFTOHTML + end_rules
|
||||
else:
|
||||
rules = []
|
||||
for rule in self.PREPROCESS + rules:
|
||||
|
@ -20,10 +20,20 @@ class PDFInput(InputFormatPlugin):
|
||||
options = set([
|
||||
OptionRecommendation(name='no_images', recommended_value=False,
|
||||
help=_('Do not extract images from the document')),
|
||||
OptionRecommendation(name='pdf_line_length', recommended_value=0.5,
|
||||
OptionRecommendation(name='unwrap_factor', recommended_value=0.5,
|
||||
help=_('Scale used to determine the length at which a line should '
|
||||
'be unwrapped. Valid values are a decimal between 0 and 1. The '
|
||||
'default is 0.5, this is the median line length.')),
|
||||
OptionRecommendation(name='remove_header', recommended_value=False,
|
||||
help=_('Use a regular expression to try and remove the header.')),
|
||||
OptionRecommendation(name='header_regex',
|
||||
recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
|
||||
help=_('The regular expression to use to remove the header.')),
|
||||
OptionRecommendation(name='remove_footer', recommended_value=False,
|
||||
help=_('Use a regular expression to try and remove the footer.')),
|
||||
OptionRecommendation(name='footer_regex',
|
||||
recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
|
||||
help=_('The regular expression to use to remove the footer.')),
|
||||
])
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
@ -42,12 +52,7 @@ class PDFInput(InputFormatPlugin):
|
||||
images = os.listdir(os.getcwd())
|
||||
images.remove('index.html')
|
||||
for i in images:
|
||||
# Remove the - from the file name because it causes problems.
|
||||
# The reference to the image with the - will be changed to not
|
||||
# include it later in the conversion process.
|
||||
new_i = i.replace('-', '')
|
||||
os.rename(i, new_i)
|
||||
manifest.append((new_i, None))
|
||||
manifest.append((i, None))
|
||||
log.debug('Generating manifest...')
|
||||
opf.create_manifest(manifest)
|
||||
|
||||
|
@ -4,8 +4,13 @@ __license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
from PyQt4.Qt import SIGNAL
|
||||
|
||||
from calibre.gui2.convert.pdf_input_ui import Ui_Form
|
||||
from calibre.gui2.convert import Widget
|
||||
from calibre.gui2 import qstring_to_unicode, error_dialog
|
||||
|
||||
class PluginWidget(Widget, Ui_Form):
|
||||
|
||||
@ -14,6 +19,31 @@ class PluginWidget(Widget, Ui_Form):
|
||||
|
||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||
Widget.__init__(self, parent, 'pdf_input',
|
||||
['no_images', 'pdf_line_length'])
|
||||
['no_images', 'unwrap_factor', 'remove_header', 'header_regex',
|
||||
'remove_footer', 'footer_regex'])
|
||||
self.db, self.book_id = db, book_id
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
||||
|
||||
self.opt_header_regex.setEnabled(self.opt_remove_header.isChecked())
|
||||
self.opt_footer_regex.setEnabled(self.opt_remove_footer.isChecked())
|
||||
|
||||
self.connect(self.opt_remove_header, SIGNAL('stateChanged(int)'), self.header_regex_state)
|
||||
self.connect(self.opt_remove_footer, SIGNAL('stateChanged(int)'), self.footer_regex_state)
|
||||
|
||||
def header_regex_state(self, state):
    """Slot for the ``stateChanged(int)`` signal of ``opt_remove_header``.

    Passes the emitted check state straight to ``setEnabled`` on the
    header regex line edit, so the field is only editable while the
    "remove header" option is ticked.

    :param state: the integer check state delivered by the signal.
    """
    regex_field = self.opt_header_regex
    regex_field.setEnabled(state)
|
||||
|
||||
def footer_regex_state(self, state):
    """Slot for the ``stateChanged(int)`` signal of ``opt_remove_footer``.

    Passes the emitted check state straight to ``setEnabled`` on the
    footer regex line edit, so the field is only editable while the
    "remove footer" option is ticked.

    :param state: the integer check state delivered by the signal.
    """
    regex_field = self.opt_footer_regex
    regex_field.setEnabled(state)
|
||||
|
||||
def pre_commit_check(self):
    """Check that the header and footer regular expressions compile.

    Reads the text of ``opt_header_regex`` and then ``opt_footer_regex``
    and feeds each to ``re.compile``. Both fields are checked regardless
    of whether the matching "remove" checkbox is ticked. On the first
    pattern that fails, an error dialog is shown and checking stops.

    :return: ``True`` when both patterns compile, ``False`` as soon as
        one of them does not.
    """
    for opt_name in ('header_regex', 'footer_regex'):
        field = getattr(self, 'opt_' + opt_name)
        try:
            pat = qstring_to_unicode(field.text())
            re.compile(pat)
        # ``except X as err`` is accepted by Python 2.6+ and Python 3; the
        # older ``except X, err`` spelling is a SyntaxError on Python 3.
        except Exception as err:
            error_dialog(self, _('Invalid regular expression'),
                    _('Invalid regular expression: %s')%err).exec_()
            return False
    return True
|
||||
|
@ -14,14 +14,14 @@
|
||||
<string>Form</string>
|
||||
</property>
|
||||
<layout class="QGridLayout" name="gridLayout">
|
||||
<item row="1" column="0">
|
||||
<item row="2" column="0">
|
||||
<widget class="QLabel" name="label_2">
|
||||
<property name="text">
|
||||
<string>Line Un-Wrapping Factor:</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="3" column="0">
|
||||
<item row="4" column="0">
|
||||
<spacer name="verticalSpacer">
|
||||
<property name="orientation">
|
||||
<enum>Qt::Vertical</enum>
|
||||
@ -34,8 +34,8 @@
|
||||
</property>
|
||||
</spacer>
|
||||
</item>
|
||||
<item row="1" column="1">
|
||||
<widget class="QDoubleSpinBox" name="opt_pdf_line_length">
|
||||
<item row="2" column="1">
|
||||
<widget class="QDoubleSpinBox" name="opt_unwrap_factor">
|
||||
<property name="maximum">
|
||||
<double>1.000000000000000</double>
|
||||
</property>
|
||||
@ -47,13 +47,33 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="2" column="0">
|
||||
<item row="3" column="0">
|
||||
<widget class="QCheckBox" name="opt_no_images">
|
||||
<property name="text">
|
||||
<string>No Images</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="0" column="0">
|
||||
<widget class="QCheckBox" name="opt_remove_header">
|
||||
<property name="text">
|
||||
<string>Remove Header</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="1" column="0">
|
||||
<widget class="QCheckBox" name="opt_remove_footer">
|
||||
<property name="text">
|
||||
<string>Remove Footer</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="0" column="1">
|
||||
<widget class="QLineEdit" name="opt_header_regex"/>
|
||||
</item>
|
||||
<item row="1" column="1">
|
||||
<widget class="QLineEdit" name="opt_footer_regex"/>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
<resources/>
|
||||
|
Loading…
x
Reference in New Issue
Block a user