Conversion: Use the same regular expression engine that the Edit Book tool uses. The new engine has much better support for Unicode characters and character classes.

This commit is contained in:
Kovid Goyal 2017-04-16 17:43:17 +05:30
parent cb77fecb5c
commit c6d46ceffa
5 changed files with 34 additions and 21 deletions

View File

@ -541,8 +541,9 @@ class HTMLPreProcessor(object):
# Function for processing search and replace # Function for processing search and replace
def do_search_replace(search_pattern, replace_txt): def do_search_replace(search_pattern, replace_txt):
from calibre.ebooks.conversion.search_replace import compile_regular_expression
try: try:
search_re = re.compile(search_pattern) search_re = compile_regular_expression(search_pattern)
if not replace_txt: if not replace_txt:
replace_txt = '' replace_txt = ''
rules.insert(0, (search_re, replace_txt)) rules.insert(0, (search_re, replace_txt))
@ -617,7 +618,7 @@ class HTMLPreProcessor(object):
for rule in rules + end_rules: for rule in rules + end_rules:
try: try:
html = rule[0].sub(rule[1], html) html = rule[0].sub(rule[1], html)
except re.error as e: except Exception as e:
if rule in user_sr_rules: if rule in user_sr_rules:
self.log.error( self.log.error(
'User supplied search & replace rule: %s -> %s ' 'User supplied search & replace rule: %s -> %s '
@ -678,5 +679,3 @@ class HTMLPreProcessor(object):
html = html.replace(char, asciichar) html = html.replace(char, asciichar)
return html return html

View File

@ -0,0 +1,19 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
import regex
# Default flag set applied by compile_regular_expression() when the caller
# does not pass explicit flags.
REGEX_FLAGS = regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.MULTILINE | regex.UNICODE
# Module-level memo of compiled patterns, keyed by (flags, text).
regex_cache = {}
def compile_regular_expression(text, flags=REGEX_FLAGS):
    """Compile ``text`` into a pattern object, memoizing the result.

    Compiled patterns are stored in the module-level ``regex_cache`` keyed by
    ``(flags, text)``, so asking again for the same expression with the same
    flags returns the stored pattern instead of compiling a second time.

    :param text: Source of the regular expression.
    :param flags: Flags handed to ``regex.compile``; defaults to
        ``REGEX_FLAGS``.
    :return: The compiled pattern object.

    Any exception raised by ``regex.compile`` propagates to the caller and
    nothing is stored in the cache for that key.
    """
    key = flags, text
    ans = regex_cache.get(key)
    if ans is None:
        # Cache miss: compile once and remember the pattern for later calls.
        ans = regex_cache[key] = regex.compile(text, flags=flags)
    # Return the cached object; compiling again here would defeat the cache.
    return ans

View File

@ -4,7 +4,7 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re, os import os
from PyQt5.Qt import (QDialog, QWidget, QDialogButtonBox, from PyQt5.Qt import (QDialog, QWidget, QDialogButtonBox,
QBrush, QTextCursor, QTextEdit, QByteArray, Qt, pyqtSignal) QBrush, QTextCursor, QTextEdit, QByteArray, Qt, pyqtSignal)
@ -15,6 +15,7 @@ from calibre.gui2 import error_dialog, choose_files, gprefs
from calibre.gui2.dialogs.choose_format import ChooseFormatDialog from calibre.gui2.dialogs.choose_format import ChooseFormatDialog
from calibre.constants import iswindows from calibre.constants import iswindows
from calibre.utils.ipc.simple_worker import fork_job, WorkerError from calibre.utils.ipc.simple_worker import fork_job, WorkerError
from calibre.ebooks.conversion.search_replace import compile_regular_expression
from calibre.ptempfile import TemporaryFile from calibre.ptempfile import TemporaryFile
@ -60,7 +61,7 @@ class RegexBuilder(QDialog, Ui_RegexBuilder):
regex = unicode(self.regex.text()) regex = unicode(self.regex.text())
if regex: if regex:
try: try:
re.compile(regex) compile_regular_expression(regex)
self.regex.setStyleSheet('QLineEdit { color: black; background-color: rgba(0,255,0,20%); }') self.regex.setStyleSheet('QLineEdit { color: black; background-color: rgba(0,255,0,20%); }')
return True return True
except: except:
@ -87,7 +88,7 @@ class RegexBuilder(QDialog, Ui_RegexBuilder):
extsel.cursor = cursor extsel.cursor = cursor
extsel.format.setBackground(QBrush(Qt.yellow)) extsel.format.setBackground(QBrush(Qt.yellow))
try: try:
for match in re.finditer(regex, text): for match in compile_regular_expression(regex).finditer(text):
es = QTextEdit.ExtraSelection(extsel) es = QTextEdit.ExtraSelection(extsel)
es.cursor.setPosition(match.start(), QTextCursor.MoveAnchor) es.cursor.setPosition(match.start(), QTextCursor.MoveAnchor)
es.cursor.setPosition(match.end(), QTextCursor.KeepAnchor) es.cursor.setPosition(match.end(), QTextCursor.KeepAnchor)

View File

@ -4,7 +4,7 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>, 2012 Eli Algranti <idea00@hotmail.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>, 2012 Eli Algranti <idea00@hotmail.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re, codecs, json import codecs, json
from PyQt5.Qt import Qt, QTableWidgetItem from PyQt5.Qt import Qt, QTableWidgetItem
@ -14,6 +14,7 @@ from calibre.gui2 import (error_dialog, question_dialog, choose_files,
choose_save_file) choose_save_file)
from calibre import as_unicode from calibre import as_unicode
from calibre.utils.localization import localize_user_manual_link from calibre.utils.localization import localize_user_manual_link
from calibre.ebooks.conversion.search_replace import compile_regular_expression
class SearchAndReplaceWidget(Widget, Ui_Form): class SearchAndReplaceWidget(Widget, Ui_Form):
@ -209,7 +210,7 @@ class SearchAndReplaceWidget(Widget, Ui_Form):
# Verify all search expressions are valid # Verify all search expressions are valid
for search, replace in definitions: for search, replace in definitions:
try: try:
re.compile(search) compile_regular_expression(search)
except Exception as err: except Exception as err:
error_dialog(self, _('Invalid regular expression'), error_dialog(self, _('Invalid regular expression'),
_('Invalid regular expression: %s')%err, show=True) _('Invalid regular expression: %s')%err, show=True)
@ -300,4 +301,3 @@ class SearchAndReplaceWidget(Widget, Ui_Form):
'to this conversion.') 'to this conversion.')
self.setup_widget_help(self.search_replace) self.setup_widget_help(self.search_replace)
return True return True

View File

@ -33,10 +33,9 @@ from calibre.gui2.tweak_book.function_replace import (
from calibre.gui2.tweak_book.widgets import BusyCursor from calibre.gui2.tweak_book.widgets import BusyCursor
from calibre.gui2.widgets2 import FlowLayout, HistoryComboBox from calibre.gui2.widgets2 import FlowLayout, HistoryComboBox
from calibre.utils.icu import primary_contains from calibre.utils.icu import primary_contains
from calibre.ebooks.conversion.search_replace import REGEX_FLAGS, compile_regular_expression
REGEX_FLAGS = regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.MULTILINE | regex.UNICODE
# The search panel {{{ # The search panel {{{
@ -454,9 +453,6 @@ class SearchWidget(QWidget):
# }}} # }}}
regex_cache = {}
class SearchPanel(QWidget): # {{{ class SearchPanel(QWidget): # {{{
search_triggered = pyqtSignal(object) search_triggered = pyqtSignal(object)
@ -1295,12 +1291,10 @@ def get_search_regex(state):
flags |= regex.DOTALL flags |= regex.DOTALL
if state['direction'] == 'up': if state['direction'] == 'up':
flags |= regex.REVERSE flags |= regex.REVERSE
ans = regex_cache.get((flags, raw), None) try:
if ans is None: ans = compile_regular_expression(raw, flags=flags)
try: except regex.error as e:
ans = regex_cache[(flags, raw)] = regex.compile(raw, flags=flags) raise InvalidRegex(raw, e)
except regex.error as e:
raise InvalidRegex(raw, e)
return ans return ans