Function to remove surrounding punctuation

This commit is contained in:
Kovid Goyal 2019-12-01 20:03:18 +05:30
parent c344ef0775
commit aa1f52460e
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 33 additions and 1 deletions

View File

@ -6,3 +6,28 @@ from __future__ import absolute_import, division, print_function, unicode_litera
# TODO: lowercase the word? Remove trailing punctuation. Abort early if the word contains '=' or its length is < 4 or > 99.
# TODO: test with replacement words
import regex
# Flags passed to every pattern compiled by pats().
REGEX_FLAGS = regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.UNICODE
def pats():
    """Return the ``(leading, trailing)`` punctuation patterns.

    The first pattern matches a run of Unicode punctuation anchored at the
    start of the string, the second a run anchored at the end. Both are
    compiled on the first call and stored on the function object as
    ``pats.ans`` so that later calls reuse them.
    """
    cached = getattr(pats, 'ans', None)
    if cached is not None:
        return cached
    leading = regex.compile(r'^\p{P}+', REGEX_FLAGS)
    trailing = regex.compile(r'\p{P}+$', REGEX_FLAGS)
    pats.ans = leading, trailing
    return pats.ans
def remove_punctuation(word):
    """Split *word* into ``(prefix, word, suffix)``.

    ``prefix`` is the run of punctuation stripped from the start of *word*,
    ``suffix`` the run stripped from its end, and the middle element is what
    remains. Punctuation inside the word is left untouched, e.g.
    ``'wo.rd.'`` gives ``('', 'wo.rd', '.')``.
    """
    leading_pat, trailing_pat = pats()

    # The number of characters removed equals the length difference, so the
    # stripped text can be recovered by slicing; when nothing matches the
    # difference is zero and the slice is the empty string.
    without_prefix = leading_pat.sub('', word)
    prefix = word[:len(word) - len(without_prefix)]

    core = trailing_pat.sub('', without_prefix)
    suffix = without_prefix[len(core):]

    return prefix, core, suffix

View File

@ -4,13 +4,15 @@
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import shutil, os import os
import shutil
import unittest import unittest
from calibre.ptempfile import PersistentTemporaryDirectory from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.hyphenation.dictionaries import ( from calibre.utils.hyphenation.dictionaries import (
dictionary_name_for_locale, path_to_dictionary dictionary_name_for_locale, path_to_dictionary
) )
from calibre.utils.hyphenation.hyphenate import remove_punctuation
class TestHyphenation(unittest.TestCase): class TestHyphenation(unittest.TestCase):
@ -61,6 +63,11 @@ class TestHyphenation(unittest.TestCase):
) )
self.assertFalse(cache[0]) self.assertFalse(cache[0])
def test_remove_punctuation(self):
    # Each case pairs an input with its expected (prefix, word, suffix) split.
    cases = (
        ('word', ('', 'word', '')),
        ('wo.rd.', ('', 'wo.rd', '.')),
        ('"«word!!', ('"«', 'word', '!!')),
    )
    for text, expected in cases:
        self.ae(remove_punctuation(text), expected)
def find_tests():
    """Return a test suite built from the TestHyphenation test case."""
    loader = unittest.defaultTestLoader
    return loader.loadTestsFromTestCase(TestHyphenation)