# TODO: lower case word? remove trailing punctuation. abort early if contains = or length < 4 or length > 99
# TODO: test with replacement words


import regex

REGEX_FLAGS = regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.UNICODE


def pats():
    '''
    Return the ``(leading, trailing)`` pair of compiled punctuation patterns.

    ``leading`` matches a run of Unicode punctuation (general category P) at
    the very start of a string, ``trailing`` matches such a run at the very
    end. The patterns are compiled on the first call and cached as an
    attribute of this function, so later calls only do an attribute lookup.
    '''
    ans = getattr(pats, 'ans', None)
    if ans is None:
        # \A and \Z anchor to the absolute start/end of the string. A plain
        # `$` would also match just before a trailing newline, which would
        # make remove_punctuation() split off the wrong characters.
        pats.ans = ans = (
            regex.compile(r'\A\p{P}+', REGEX_FLAGS),
            regex.compile(r'\p{P}+\Z', REGEX_FLAGS),
        )
    return ans


def remove_punctuation(word):
    '''
    Split surrounding punctuation off ``word``.

    Returns a 3-tuple ``(prefix, core, suffix)`` where ``prefix`` is the run
    of punctuation at the start of ``word``, ``suffix`` is the run of
    punctuation at the end of what remains, and ``core`` is everything in
    between. Punctuation inside the word is left alone, and the three parts
    always concatenate back to the original ``word``. Punctuation means any
    character in Unicode general category P, so characters such as guillemets
    are handled as well as ASCII punctuation.

    Examples::

        remove_punctuation('word')   -> ('', 'word', '')
        remove_punctuation('wo.rd.') -> ('', 'wo.rd', '.')

    A word made only of punctuation is returned entirely as ``prefix``.
    '''
    leading, trailing = pats()
    prefix = suffix = ''
    match = leading.match(word)
    if match is not None:
        prefix, word = match.group(), word[match.end():]
    # Search the remainder only, so characters already taken as prefix cannot
    # be counted a second time as suffix.
    match = trailing.search(word)
    if match is not None:
        word, suffix = word[:match.start()], match.group()
    return prefix, word, suffix