And yet more crap with pykakasi

This commit is contained in:
Kovid Goyal 2024-11-12 20:38:57 +05:30
parent 04d5728ef9
commit bcfdfc1e1d
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 19 additions and 9 deletions

View File

@@ -7,6 +7,8 @@ set -xe
# Install the build and runtime dependencies from the distribution repositories.
# --noconfirm answers pacman's prompts automatically; --needed skips packages
# that are already installed and up to date.
pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder cmake chmlib icu jxrlib hunspell libmtp libusb libwmf optipng python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-jeepney python-dnspython python-feedparser python-html2text python-html5-parser python-lxml python-lxml-html-clean python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt6 python-regex python-zeroconf python-pyqt6-webengine qt6-svg qt6-imageformats qt6-speech udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip python-fonttools python-xxhash uchardet libstemmer poppler tk podofo
# pykakasi is installed through pip rather than pacman; --break-system-packages
# lets pip install into the system-managed Python environment.
# NOTE(review): presumably pykakasi is not available as a pacman package — confirm.
pip install --break-system-packages pykakasi
# Create the "ci" user with a home directory (-m) ...
useradd -m ci
# ... and recursively hand ownership of the workspace to ci (group "users").
chown -R ci:users $GITHUB_WORKSPACE

View File

@@ -18,6 +18,7 @@ Copyright (c) 2010 Hiroshi Miura
import pickle
import re
import warnings
from importlib.resources import files
from pykakasi import kakasi
@@ -63,18 +64,25 @@ class Jadecoder(Unidecoder):
# Set up a Jadecoder: build the code-point table and a pykakasi converter
# (stored as self.conv) that is later used to romanize Japanese text.
# NOTE(review): this text is a diff rendering whose indentation and +/- markers
# were stripped, so statement nesting cannot be read from here.
def __init__(self):
# Start from a copy of the CODEPOINTS table and overlay the JACODES entries.
self.codepoints = CODEPOINTS.copy()
self.codepoints.update(JACODES)
# NOTE(review): the next eight lines (kakasi() through getConverter()) appear to
# be the pre-change setup removed by this commit — confirm against the commit.
self.kakasi = kakasi()
self.kakasi.setMode("H","a") # Hiragana to ascii, default: no conversion
self.kakasi.setMode("K","a") # Katakana to ascii, default: no conversion
self.kakasi.setMode("J","a") # Japanese to ascii, default: no conversion
self.kakasi.setMode("r","Hepburn") # default: use Hepburn Roman table
self.kakasi.setMode("s", True) # add space, default: no separator
self.kakasi.setMode("C", True) # capitalize, default: no capitalize
self.conv = self.kakasi.getConverter()
# NOTE(review): from here on appears to be the replacement setup added by this
# commit: the same configuration, with warnings filtered while it runs.
# We have to use the deprecated API as the new API does not capitalize
# words. Sigh.
with warnings.catch_warnings():
# Ignore every warning category until the catch_warnings() context exits,
# at which point the previous warning filters are restored.
warnings.simplefilter("ignore")
self.kakasi = kakasi()
self.kakasi.setMode("H","a") # Hiragana to ascii, default: no conversion
self.kakasi.setMode("K","a") # Katakana to ascii, default: no conversion
self.kakasi.setMode("J","a") # Japanese to ascii, default: no conversion
self.kakasi.setMode("r","Hepburn") # default: use Hepburn Roman table
self.kakasi.setMode("s", True) # add space, default: no separator
self.kakasi.setMode("C", True) # capitalize, default: no capitalize
# Keep the configured converter; decode() calls its do() method.
self.conv = self.kakasi.getConverter()
# Convert text to ASCII: first run it through the pykakasi converter, then
# replace every remaining non-ASCII character via self.replace_point().
# Returns the resulting string.
# NOTE(review): this text is a diff rendering whose indentation and +/- markers
# were stripped, so statement nesting cannot be read from here.
def decode(self, text):
try:
# NOTE(review): this bare call appears to be the pre-change line removed by
# this commit; the warning-suppressed call below replaces it — confirm.
text = self.conv.do(text)
with warnings.catch_warnings():
# Ignore all warnings raised while the converter runs.
warnings.simplefilter("ignore")
text = self.conv.do(text)
except Exception:
# Best effort: if conversion fails, fall through with text as it was.
pass
# Map each character outside \x00-\x7f through replace_point(), one match at a time.
return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)