From bcfdfc1e1d6c899c1fb45edfba7f528c3ed2bbc6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 12 Nov 2024 20:38:57 +0530 Subject: [PATCH] And yet more crap with pykakasi --- setup/arch-ci.sh | 2 ++ src/calibre/ebooks/unihandecode/jadecoder.py | 26 +++++++++++++------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/setup/arch-ci.sh b/setup/arch-ci.sh index b41ff34ddc..1add54a965 100755 --- a/setup/arch-ci.sh +++ b/setup/arch-ci.sh @@ -7,6 +7,8 @@ set -xe pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder cmake chmlib icu jxrlib hunspell libmtp libusb libwmf optipng python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-jeepney python-dnspython python-feedparser python-html2text python-html5-parser python-lxml python-lxml-html-clean python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt6 python-regex python-zeroconf python-pyqt6-webengine qt6-svg qt6-imageformats qt6-speech udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip python-fonttools python-xxhash uchardet libstemmer poppler tk podofo +pip install --break-system-packages pykakasi + useradd -m ci chown -R ci:users $GITHUB_WORKSPACE diff --git a/src/calibre/ebooks/unihandecode/jadecoder.py b/src/calibre/ebooks/unihandecode/jadecoder.py index 985063c91c..4f654c0049 100644 --- a/src/calibre/ebooks/unihandecode/jadecoder.py +++ b/src/calibre/ebooks/unihandecode/jadecoder.py @@ -18,6 +18,7 @@ Copyright (c) 2010 Hiroshi Miura import pickle import re +import warnings from importlib.resources import files from pykakasi import kakasi @@ -63,18 +64,25 @@ class Jadecoder(Unidecoder): def __init__(self): self.codepoints = CODEPOINTS.copy() self.codepoints.update(JACODES) - self.kakasi = kakasi() - self.kakasi.setMode("H","a") # Hiragana to ascii, default: no conversion - self.kakasi.setMode("K","a") # Katakana to ascii, default: no conversion - self.kakasi.setMode("J","a") # Japanese to ascii, default: no conversion - self.kakasi.setMode("r","Hepburn") # default: use Hepburn Roman table - self.kakasi.setMode("s", True) # add space, default: no separator - self.kakasi.setMode("C", True) # capitalize, default: no capitalize - self.conv = self.kakasi.getConverter() + + # We have to use the deprecated API as the new API does not capitalize + # words. Sigh. + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + self.kakasi = kakasi() + self.kakasi.setMode("H","a") # Hiragana to ascii, default: no conversion + self.kakasi.setMode("K","a") # Katakana to ascii, default: no conversion + self.kakasi.setMode("J","a") # Japanese to ascii, default: no conversion + self.kakasi.setMode("r","Hepburn") # default: use Hepburn Roman table + self.kakasi.setMode("s", True) # add space, default: no separator + self.kakasi.setMode("C", True) # capitalize, default: no capitalize + self.conv = self.kakasi.getConverter() def decode(self, text): try: - text = self.conv.do(text) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + text = self.conv.do(text) except Exception: pass return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)