And yet more crap with pykakasi

This commit is contained in:
Kovid Goyal 2024-11-12 20:38:57 +05:30
parent 04d5728ef9
commit bcfdfc1e1d
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 19 additions and 9 deletions

View File

@ -7,6 +7,8 @@ set -xe
# Install the distribution packages with pacman (non-interactive; packages
# that are already up to date are skipped because of --needed).
pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder cmake chmlib icu jxrlib hunspell libmtp libusb libwmf optipng python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-jeepney python-dnspython python-feedparser python-html2text python-html5-parser python-lxml python-lxml-html-clean python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt6 python-regex python-zeroconf python-pyqt6-webengine qt6-svg qt6-imageformats qt6-speech udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip python-fonttools python-xxhash uchardet libstemmer poppler tk podofo
# pykakasi is installed with pip rather than pacman; --break-system-packages
# lets pip install into the externally managed system Python environment.
pip install --break-system-packages pykakasi
# Create the "ci" user (with a home directory) and give it ownership of the
# checkout. The path is quoted so a workspace path containing spaces or glob
# characters is passed to chown as a single argument.
useradd -m ci
chown -R ci:users "$GITHUB_WORKSPACE"

View File

@ -18,6 +18,7 @@ Copyright (c) 2010 Hiroshi Miura
import pickle import pickle
import re import re
import warnings
from importlib.resources import files from importlib.resources import files
from pykakasi import kakasi from pykakasi import kakasi
@ -63,18 +64,25 @@ class Jadecoder(Unidecoder):
def __init__(self):
    """Build the code point table and the pykakasi converter.

    ``self.codepoints`` is a copy of ``CODEPOINTS`` updated with
    ``JACODES``. ``self.kakasi`` is a ``kakasi`` instance configured through
    its ``setMode`` API, and ``self.conv`` is the converter it returns.
    """
    self.codepoints = CODEPOINTS.copy()
    self.codepoints.update(JACODES)

    # We have to use the deprecated API as the new API does not capitalize
    # words. Sigh.
    with warnings.catch_warnings():
        # Ignore every warning raised inside this block (presumably the
        # deprecation warnings from the legacy pykakasi API).
        warnings.simplefilter("ignore")
        self.kakasi = kakasi()
        self.kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
        self.kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
        self.kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
        self.kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
        self.kakasi.setMode("s", True)  # add space, default: no separator
        self.kakasi.setMode("C", True)  # capitalize, default: no capitalize
        self.conv = self.kakasi.getConverter()
def decode(self, text):
    """Return ``text`` with all non-ASCII characters transliterated.

    The text is first run through ``self.conv.do``. If that call raises,
    the original text is used unchanged. Every character outside the
    ``\\x00``-``\\x7f`` range that remains is then replaced with the result
    of ``self.replace_point`` for that character.
    """
    try:
        # Ignore any warnings emitted while the converter runs.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            text = self.conv.do(text)
    except Exception:
        # Best effort: a converter failure must not abort decoding, so
        # fall through to per-character replacement of the unconverted text.
        pass
    return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)