From c6d46ceffa2de6fbb828655d7f0a9a34739a558a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 16 Apr 2017 17:43:17 +0530 Subject: [PATCH] Conversion: Use the same regular expression engine as is used by the Edit Book tool. The new engine has much better support for unicode characters/character classes. --- src/calibre/ebooks/conversion/preprocess.py | 7 +++---- .../ebooks/conversion/search_replace.py | 19 +++++++++++++++++++ src/calibre/gui2/convert/regex_builder.py | 7 ++++--- .../gui2/convert/search_and_replace.py | 6 +++--- src/calibre/gui2/tweak_book/search.py | 16 +++++----------- 5 files changed, 34 insertions(+), 21 deletions(-) create mode 100644 src/calibre/ebooks/conversion/search_replace.py diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index eba44661eb..92864f8e6f 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -541,8 +541,9 @@ class HTMLPreProcessor(object): # Function for processing search and replace def do_search_replace(search_pattern, replace_txt): + from calibre.ebooks.conversion.search_replace import compile_regular_expression try: - search_re = re.compile(search_pattern) + search_re = compile_regular_expression(search_pattern) if not replace_txt: replace_txt = '' rules.insert(0, (search_re, replace_txt)) @@ -617,7 +618,7 @@ class HTMLPreProcessor(object): for rule in rules + end_rules: try: html = rule[0].sub(rule[1], html) - except re.error as e: + except Exception as e: if rule in user_sr_rules: self.log.error( 'User supplied search & replace rule: %s -> %s ' @@ -678,5 +679,3 @@ class HTMLPreProcessor(object): html = html.replace(char, asciichar) return html - - diff --git a/src/calibre/ebooks/conversion/search_replace.py b/src/calibre/ebooks/conversion/search_replace.py new file mode 100644 index 0000000000..a43d4e9193 --- /dev/null +++ b/src/calibre/ebooks/conversion/search_replace.py @@ -0,0 +1,19 @@ +#!/usr/bin/env 
python2 +# vim:fileencoding=utf-8 +# License: GPLv3 Copyright: 2017, Kovid Goyal + +from __future__ import absolute_import, division, print_function, unicode_literals + +import regex + +REGEX_FLAGS = regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.MULTILINE | regex.UNICODE + +regex_cache = {} + + +def compile_regular_expression(text, flags=REGEX_FLAGS): + key = flags, text + ans = regex_cache.get(key) + if ans is None: + ans = regex_cache[key] = regex.compile(text, flags=flags) + return ans diff --git a/src/calibre/gui2/convert/regex_builder.py b/src/calibre/gui2/convert/regex_builder.py index 6c3043e822..c530f34f3b 100644 --- a/src/calibre/gui2/convert/regex_builder.py +++ b/src/calibre/gui2/convert/regex_builder.py @@ -4,7 +4,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import re, os +import os from PyQt5.Qt import (QDialog, QWidget, QDialogButtonBox, QBrush, QTextCursor, QTextEdit, QByteArray, Qt, pyqtSignal) @@ -15,6 +15,7 @@ from calibre.gui2 import error_dialog, choose_files, gprefs from calibre.gui2.dialogs.choose_format import ChooseFormatDialog from calibre.constants import iswindows from calibre.utils.ipc.simple_worker import fork_job, WorkerError +from calibre.ebooks.conversion.search_replace import compile_regular_expression from calibre.ptempfile import TemporaryFile @@ -60,7 +61,7 @@ class RegexBuilder(QDialog, Ui_RegexBuilder): regex = unicode(self.regex.text()) if regex: try: - re.compile(regex) + compile_regular_expression(regex) self.regex.setStyleSheet('QLineEdit { color: black; background-color: rgba(0,255,0,20%); }') return True except: @@ -87,7 +88,7 @@ class RegexBuilder(QDialog, Ui_RegexBuilder): extsel.cursor = cursor extsel.format.setBackground(QBrush(Qt.yellow)) try: - for match in re.finditer(regex, text): + for match in compile_regular_expression(regex).finditer(text): es = QTextEdit.ExtraSelection(extsel) es.cursor.setPosition(match.start(),
QTextCursor.MoveAnchor) es.cursor.setPosition(match.end(), QTextCursor.KeepAnchor) diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py index 86dc1ec2e9..ad2253211b 100644 --- a/src/calibre/gui2/convert/search_and_replace.py +++ b/src/calibre/gui2/convert/search_and_replace.py @@ -4,7 +4,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember , 2012 Eli Algranti ' __docformat__ = 'restructuredtext en' -import re, codecs, json +import codecs, json from PyQt5.Qt import Qt, QTableWidgetItem @@ -14,6 +14,7 @@ from calibre.gui2 import (error_dialog, question_dialog, choose_files, choose_save_file) from calibre import as_unicode from calibre.utils.localization import localize_user_manual_link +from calibre.ebooks.conversion.search_replace import compile_regular_expression class SearchAndReplaceWidget(Widget, Ui_Form): @@ -209,7 +210,7 @@ class SearchAndReplaceWidget(Widget, Ui_Form): # Verify all search expressions are valid for search, replace in definitions: try: - re.compile(search) + compile_regular_expression(search) except Exception as err: error_dialog(self, _('Invalid regular expression'), _('Invalid regular expression: %s')%err, show=True) @@ -300,4 +301,3 @@ class SearchAndReplaceWidget(Widget, Ui_Form): 'to this conversion.') self.setup_widget_help(self.search_replace) return True - diff --git a/src/calibre/gui2/tweak_book/search.py b/src/calibre/gui2/tweak_book/search.py index f8b8c20251..2a3ca17f5d 100644 --- a/src/calibre/gui2/tweak_book/search.py +++ b/src/calibre/gui2/tweak_book/search.py @@ -33,10 +33,9 @@ from calibre.gui2.tweak_book.function_replace import ( from calibre.gui2.tweak_book.widgets import BusyCursor from calibre.gui2.widgets2 import FlowLayout, HistoryComboBox from calibre.utils.icu import primary_contains +from calibre.ebooks.conversion.search_replace import REGEX_FLAGS, compile_regular_expression -REGEX_FLAGS = regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.MULTILINE | 
regex.UNICODE - # The search panel {{{ @@ -454,9 +453,6 @@ class SearchWidget(QWidget): # }}} -regex_cache = {} - - class SearchPanel(QWidget): # {{{ search_triggered = pyqtSignal(object) @@ -1295,12 +1291,10 @@ def get_search_regex(state): flags |= regex.DOTALL if state['direction'] == 'up': flags |= regex.REVERSE - ans = regex_cache.get((flags, raw), None) - if ans is None: - try: - ans = regex_cache[(flags, raw)] = regex.compile(raw, flags=flags) - except regex.error as e: - raise InvalidRegex(raw, e) + try: + ans = compile_regular_expression(raw, flags=flags) + except regex.error as e: + raise InvalidRegex(raw, e) return ans