mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
Header and footer removal by regex moved from PDF input to structure detection in plumber.
This commit is contained in:
parent
21140bc72b
commit
794eba4b46
@ -315,6 +315,31 @@ OptionRecommendation(name='preprocess_html',
|
|||||||
)
|
)
|
||||||
),
|
),
|
||||||
|
|
||||||
|
OptionRecommendation(name='remove_header',
|
||||||
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Use a regular expression to try and remove the header.'
|
||||||
|
)
|
||||||
|
),
|
||||||
|
|
||||||
|
OptionRecommendation(name='header_regex',
|
||||||
|
recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
|
||||||
|
level=OptionRecommendation.LOW,
|
||||||
|
help=_('The regular expression to use to remove the header.'
|
||||||
|
)
|
||||||
|
),
|
||||||
|
|
||||||
|
OptionRecommendation(name='remove_footer',
|
||||||
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Use a regular expression to try and remove the footer.'
|
||||||
|
)
|
||||||
|
),
|
||||||
|
|
||||||
|
OptionRecommendation(name='footer_regex',
|
||||||
|
recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
|
||||||
|
level=OptionRecommendation.LOW,
|
||||||
|
help=_('The regular expression to use to remove the footer.'
|
||||||
|
)
|
||||||
|
),
|
||||||
|
|
||||||
OptionRecommendation(name='read_metadata_from_opf',
|
OptionRecommendation(name='read_metadata_from_opf',
|
||||||
recommended_value=None, level=OptionRecommendation.LOW,
|
recommended_value=None, level=OptionRecommendation.LOW,
|
||||||
|
@ -185,17 +185,7 @@ class HTMLPreProcessor(object):
|
|||||||
elif self.is_book_designer(html):
|
elif self.is_book_designer(html):
|
||||||
rules = self.BOOK_DESIGNER
|
rules = self.BOOK_DESIGNER
|
||||||
elif self.is_pdftohtml(html):
|
elif self.is_pdftohtml(html):
|
||||||
start_rules = []
|
|
||||||
end_rules = []
|
end_rules = []
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'remove_header', None):
|
|
||||||
start_rules.append(
|
|
||||||
(re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
|
|
||||||
)
|
|
||||||
if getattr(self.extra_opts, 'remove_footer', None):
|
|
||||||
start_rules.append(
|
|
||||||
(re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
|
|
||||||
)
|
|
||||||
if getattr(self.extra_opts, 'unwrap_factor', None):
|
if getattr(self.extra_opts, 'unwrap_factor', None):
|
||||||
length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
|
length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
|
||||||
if length:
|
if length:
|
||||||
@ -204,10 +194,21 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
|
(re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
|
||||||
)
|
)
|
||||||
|
|
||||||
rules = start_rules + self.PDFTOHTML + end_rules
|
rules = self.PDFTOHTML + end_rules
|
||||||
else:
|
else:
|
||||||
rules = []
|
rules = []
|
||||||
for rule in self.PREPROCESS + rules:
|
|
||||||
|
pre_rules = []
|
||||||
|
if getattr(self.extra_opts, 'remove_header', None):
|
||||||
|
pre_rules.append(
|
||||||
|
(re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
|
||||||
|
)
|
||||||
|
if getattr(self.extra_opts, 'remove_footer', None):
|
||||||
|
pre_rules.append(
|
||||||
|
(re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
|
||||||
|
)
|
||||||
|
|
||||||
|
for rule in self.PREPROCESS + pre_rules + rules:
|
||||||
html = rule[0].sub(rule[1], html)
|
html = rule[0].sub(rule[1], html)
|
||||||
|
|
||||||
# Handle broken XHTML w/ SVG (ugh)
|
# Handle broken XHTML w/ SVG (ugh)
|
||||||
|
@ -24,16 +24,6 @@ class PDFInput(InputFormatPlugin):
|
|||||||
help=_('Scale used to determine the length at which a line should '
|
help=_('Scale used to determine the length at which a line should '
|
||||||
'be unwrapped. Valid values are a decimal between 0 and 1. The '
|
'be unwrapped. Valid values are a decimal between 0 and 1. The '
|
||||||
'default is 0.5, this is the median line length.')),
|
'default is 0.5, this is the median line length.')),
|
||||||
OptionRecommendation(name='remove_header', recommended_value=False,
|
|
||||||
help=_('Use a regular expression to try and remove the header.')),
|
|
||||||
OptionRecommendation(name='header_regex',
|
|
||||||
recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
|
|
||||||
help=_('The regular expression to use to remove the header.')),
|
|
||||||
OptionRecommendation(name='remove_footer', recommended_value=False,
|
|
||||||
help=_('Use a regular expression to try and remove the footer.')),
|
|
||||||
OptionRecommendation(name='footer_regex',
|
|
||||||
recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
|
|
||||||
help=_('The regular expression to use to remove the footer.')),
|
|
||||||
])
|
])
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
|
@ -4,13 +4,8 @@ __license__ = 'GPL 3'
|
|||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
from PyQt4.Qt import SIGNAL
|
|
||||||
|
|
||||||
from calibre.gui2.convert.pdf_input_ui import Ui_Form
|
from calibre.gui2.convert.pdf_input_ui import Ui_Form
|
||||||
from calibre.gui2.convert import Widget
|
from calibre.gui2.convert import Widget
|
||||||
from calibre.gui2 import qstring_to_unicode, error_dialog
|
|
||||||
|
|
||||||
class PluginWidget(Widget, Ui_Form):
|
class PluginWidget(Widget, Ui_Form):
|
||||||
|
|
||||||
@ -19,31 +14,6 @@ class PluginWidget(Widget, Ui_Form):
|
|||||||
|
|
||||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||||
Widget.__init__(self, parent, 'pdf_input',
|
Widget.__init__(self, parent, 'pdf_input',
|
||||||
['no_images', 'unwrap_factor', 'remove_header', 'header_regex',
|
['no_images', 'unwrap_factor'])
|
||||||
'remove_footer', 'footer_regex'])
|
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
self.initialize_options(get_option, get_help, db, book_id)
|
||||||
|
|
||||||
self.opt_header_regex.setEnabled(self.opt_remove_header.isChecked())
|
|
||||||
self.opt_footer_regex.setEnabled(self.opt_remove_footer.isChecked())
|
|
||||||
|
|
||||||
self.connect(self.opt_remove_header, SIGNAL('stateChanged(int)'), self.header_regex_state)
|
|
||||||
self.connect(self.opt_remove_footer, SIGNAL('stateChanged(int)'), self.footer_regex_state)
|
|
||||||
|
|
||||||
def header_regex_state(self, state):
|
|
||||||
self.opt_header_regex.setEnabled(state)
|
|
||||||
|
|
||||||
def footer_regex_state(self, state):
|
|
||||||
self.opt_footer_regex.setEnabled(state)
|
|
||||||
|
|
||||||
def pre_commit_check(self):
|
|
||||||
for x in ('header_regex', 'footer_regex'):
|
|
||||||
x = getattr(self, 'opt_'+x)
|
|
||||||
try:
|
|
||||||
pat = qstring_to_unicode(x.text())
|
|
||||||
re.compile(pat)
|
|
||||||
except Exception, err:
|
|
||||||
error_dialog(self, _('Invalid regular expression'),
|
|
||||||
_('Invalid regular expression: %s')%err).exec_()
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
@ -14,14 +14,14 @@
|
|||||||
<string>Form</string>
|
<string>Form</string>
|
||||||
</property>
|
</property>
|
||||||
<layout class="QGridLayout" name="gridLayout">
|
<layout class="QGridLayout" name="gridLayout">
|
||||||
<item row="2" column="0">
|
<item row="0" column="0">
|
||||||
<widget class="QLabel" name="label_2">
|
<widget class="QLabel" name="label_2">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>Line Un-Wrapping Factor:</string>
|
<string>Line Un-Wrapping Factor:</string>
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="4" column="0">
|
<item row="2" column="0">
|
||||||
<spacer name="verticalSpacer">
|
<spacer name="verticalSpacer">
|
||||||
<property name="orientation">
|
<property name="orientation">
|
||||||
<enum>Qt::Vertical</enum>
|
<enum>Qt::Vertical</enum>
|
||||||
@ -34,7 +34,7 @@
|
|||||||
</property>
|
</property>
|
||||||
</spacer>
|
</spacer>
|
||||||
</item>
|
</item>
|
||||||
<item row="2" column="1">
|
<item row="0" column="1">
|
||||||
<widget class="QDoubleSpinBox" name="opt_unwrap_factor">
|
<widget class="QDoubleSpinBox" name="opt_unwrap_factor">
|
||||||
<property name="maximum">
|
<property name="maximum">
|
||||||
<double>1.000000000000000</double>
|
<double>1.000000000000000</double>
|
||||||
@ -47,33 +47,13 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="3" column="0">
|
<item row="1" column="0">
|
||||||
<widget class="QCheckBox" name="opt_no_images">
|
<widget class="QCheckBox" name="opt_no_images">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>No Images</string>
|
<string>No Images</string>
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="0" column="0">
|
|
||||||
<widget class="QCheckBox" name="opt_remove_header">
|
|
||||||
<property name="text">
|
|
||||||
<string>Remove Header</string>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="1" column="0">
|
|
||||||
<widget class="QCheckBox" name="opt_remove_footer">
|
|
||||||
<property name="text">
|
|
||||||
<string>Remove Footer</string>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="0" column="1">
|
|
||||||
<widget class="QLineEdit" name="opt_header_regex"/>
|
|
||||||
</item>
|
|
||||||
<item row="1" column="1">
|
|
||||||
<widget class="QLineEdit" name="opt_footer_regex"/>
|
|
||||||
</item>
|
|
||||||
</layout>
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
<resources/>
|
<resources/>
|
||||||
|
@ -6,10 +6,13 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from PyQt4.Qt import SIGNAL
|
||||||
|
|
||||||
from calibre.gui2.convert.structure_detection_ui import Ui_Form
|
from calibre.gui2.convert.structure_detection_ui import Ui_Form
|
||||||
from calibre.gui2.convert import Widget
|
from calibre.gui2.convert import Widget
|
||||||
from calibre.gui2 import error_dialog
|
from calibre.gui2 import error_dialog, qstring_to_unicode
|
||||||
|
|
||||||
class StructureDetectionWidget(Widget, Ui_Form):
|
class StructureDetectionWidget(Widget, Ui_Form):
|
||||||
|
|
||||||
@ -23,7 +26,8 @@ class StructureDetectionWidget(Widget, Ui_Form):
|
|||||||
['chapter', 'chapter_mark',
|
['chapter', 'chapter_mark',
|
||||||
'remove_first_image',
|
'remove_first_image',
|
||||||
'insert_metadata', 'page_breaks_before',
|
'insert_metadata', 'page_breaks_before',
|
||||||
'preprocess_html']
|
'preprocess_html', 'remove_header', 'header_regex',
|
||||||
|
'remove_footer', 'footer_regex']
|
||||||
)
|
)
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
self.initialize_options(get_option, get_help, db, book_id)
|
||||||
@ -31,8 +35,16 @@ class StructureDetectionWidget(Widget, Ui_Form):
|
|||||||
self.opt_page_breaks_before.set_msg(_('Insert page breaks before '
|
self.opt_page_breaks_before.set_msg(_('Insert page breaks before '
|
||||||
'(XPath expression):'))
|
'(XPath expression):'))
|
||||||
|
|
||||||
|
|
||||||
def pre_commit_check(self):
|
def pre_commit_check(self):
|
||||||
|
for x in ('header_regex', 'footer_regex'):
|
||||||
|
x = getattr(self, 'opt_'+x)
|
||||||
|
try:
|
||||||
|
pat = qstring_to_unicode(x.text())
|
||||||
|
re.compile(pat)
|
||||||
|
except Exception, err:
|
||||||
|
error_dialog(self, _('Invalid regular expression'),
|
||||||
|
_('Invalid regular expression: %s')%err).exec_()
|
||||||
|
return False
|
||||||
for x in ('chapter', 'page_breaks_before'):
|
for x in ('chapter', 'page_breaks_before'):
|
||||||
x = getattr(self, 'opt_'+x)
|
x = getattr(self, 'opt_'+x)
|
||||||
if not x.check():
|
if not x.check():
|
||||||
|
@ -14,6 +14,9 @@
|
|||||||
<string>Form</string>
|
<string>Form</string>
|
||||||
</property>
|
</property>
|
||||||
<layout class="QGridLayout" name="gridLayout">
|
<layout class="QGridLayout" name="gridLayout">
|
||||||
|
<item row="0" column="0" colspan="2">
|
||||||
|
<widget class="XPathEdit" name="opt_chapter" native="true"/>
|
||||||
|
</item>
|
||||||
<item row="1" column="0">
|
<item row="1" column="0">
|
||||||
<widget class="QLabel" name="label">
|
<widget class="QLabel" name="label">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
@ -62,20 +65,27 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="4" column="0" colspan="2">
|
<item row="8" column="0">
|
||||||
|
<widget class="QLabel" name="label_3">
|
||||||
|
<property name="text">
|
||||||
|
<string>&Footer regular expression:</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_footer_regex</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="10" column="0" colspan="2">
|
||||||
<widget class="QCheckBox" name="opt_preprocess_html">
|
<widget class="QCheckBox" name="opt_preprocess_html">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>&Preprocess input file to possibly improve structure detection</string>
|
<string>&Preprocess input file to possibly improve structure detection</string>
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="5" column="0" colspan="2">
|
<item row="11" column="0" colspan="2">
|
||||||
<widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
|
<widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
|
||||||
</item>
|
</item>
|
||||||
<item row="0" column="0" colspan="2">
|
<item row="12" column="0">
|
||||||
<widget class="XPathEdit" name="opt_chapter" native="true"/>
|
|
||||||
</item>
|
|
||||||
<item row="6" column="0">
|
|
||||||
<spacer name="verticalSpacer">
|
<spacer name="verticalSpacer">
|
||||||
<property name="orientation">
|
<property name="orientation">
|
||||||
<enum>Qt::Vertical</enum>
|
<enum>Qt::Vertical</enum>
|
||||||
@ -88,6 +98,36 @@
|
|||||||
</property>
|
</property>
|
||||||
</spacer>
|
</spacer>
|
||||||
</item>
|
</item>
|
||||||
|
<item row="5" column="0">
|
||||||
|
<widget class="QLabel" name="label_2">
|
||||||
|
<property name="text">
|
||||||
|
<string>&Header regular expression:</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_header_regex</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="7" column="0">
|
||||||
|
<widget class="QCheckBox" name="opt_remove_footer">
|
||||||
|
<property name="text">
|
||||||
|
<string>Remove F&ooter</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="4" column="0">
|
||||||
|
<widget class="QCheckBox" name="opt_remove_header">
|
||||||
|
<property name="text">
|
||||||
|
<string>Remove H&eader</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="9" column="0" colspan="2">
|
||||||
|
<widget class="QLineEdit" name="opt_footer_regex"/>
|
||||||
|
</item>
|
||||||
|
<item row="6" column="0" colspan="2">
|
||||||
|
<widget class="QLineEdit" name="opt_header_regex"/>
|
||||||
|
</item>
|
||||||
</layout>
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
<customwidgets>
|
<customwidgets>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user