Update the ISO 639-3 language database

This commit is contained in:
Kovid Goyal 2019-11-02 09:21:28 +05:30
parent 6cccd18be8
commit 1b93d540b7
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 48713 additions and 39214 deletions

48688
setup/iso_639-3.json Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -726,7 +726,7 @@ class ISO639(Command): # {{{
'iso639.calibre_msgpack') 'iso639.calibre_msgpack')
def run(self, opts): def run(self, opts):
src = self.j(self.d(self.SRC), 'setup', 'iso_639_3.xml') src = self.j(self.d(self.SRC), 'setup', 'iso_639-3.json')
if not os.path.exists(src): if not os.path.exists(src):
raise Exception(src + ' does not exist') raise Exception(src + ' does not exist')
dest = self.DEST dest = self.DEST
@ -737,29 +737,24 @@ class ISO639(Command): # {{{
self.info('Packed code is up to date') self.info('Packed code is up to date')
return return
self.info('Packing ISO-639 codes to', dest) self.info('Packing ISO-639 codes to', dest)
from lxml import etree with open(src, 'rb') as f:
root = etree.fromstring(open(src, 'rb').read()) root = json.load(f)
entries = root['639-3']
by_2 = {} by_2 = {}
by_3b = {} by_3 = {}
by_3t = {}
m2to3 = {} m2to3 = {}
m3to2 = {} m3to2 = {}
m3bto3t = {}
nm = {} nm = {}
codes2, codes3t, codes3b = set(), set(), set() codes2, codes3 = set(), set()
unicode_type = type(u'') unicode_type = type(u'')
for x in root.xpath('//iso_639_3_entry'): for x in entries:
two = x.get('part1_code', None) two = x.get('alpha_2')
if two: if two:
two = unicode_type(two) two = unicode_type(two)
threet = x.get('id') threeb = x.get('alpha_3')
if threet:
threet = unicode_type(threet)
threeb = x.get('part2_code', None)
if threeb: if threeb:
threeb = unicode_type(threeb) threeb = unicode_type(threeb)
if threeb is None: if threeb is None:
# Only recognize languages in ISO-639-2
continue continue
name = x.get('name') name = x.get('name')
if name: if name:
@ -768,20 +763,16 @@ class ISO639(Command): # {{{
if two is not None: if two is not None:
by_2[two] = name by_2[two] = name
codes2.add(two) codes2.add(two)
m2to3[two] = threet m2to3[two] = threeb
m3to2[threeb] = m3to2[threet] = two m3to2[threeb] = two
by_3b[threeb] = name codes3.add(threeb)
by_3t[threet] = name by_3[threeb] = name
if threeb != threet:
m3bto3t[threeb] = threet
codes3b.add(threeb)
codes3t.add(threet)
base_name = name.lower() base_name = name.lower()
nm[base_name] = threet nm[base_name] = threeb
x = {u'by_2':by_2, u'by_3b':by_3b, u'by_3t':by_3t, u'codes2':codes2, x = {u'by_2':by_2, u'by_3':by_3, u'codes2':codes2,
u'codes3b':codes3b, u'codes3t':codes3t, u'2to3':m2to3, u'codes3':codes3, u'2to3':m2to3,
u'3to2':m3to2, u'3bto3t':m3bto3t, u'name_map':nm} u'3to2':m3to2, u'name_map':nm}
from calibre.utils.serialize import msgpack_dumps from calibre.utils.serialize import msgpack_dumps
with open(dest, 'wb') as f: with open(dest, 'wb') as f:
f.write(msgpack_dumps(x)) f.write(msgpack_dumps(x))

View File

@ -369,6 +369,8 @@ def _load_iso639():
ip = P('localization/iso639.calibre_msgpack', allow_user_override=False, data=True) ip = P('localization/iso639.calibre_msgpack', allow_user_override=False, data=True)
from calibre.utils.serialize import msgpack_loads from calibre.utils.serialize import msgpack_loads
_iso639 = msgpack_loads(ip) _iso639 = msgpack_loads(ip)
if 'by_3' not in _iso639:
_iso639['by_3'] = _iso639['by_3t']
return _iso639 return _iso639
@ -379,10 +381,8 @@ def get_iso_language(lang_trans, lang):
if len(lang) == 2: if len(lang) == 2:
ans = iso639['by_2'].get(lang, ans) ans = iso639['by_2'].get(lang, ans)
elif len(lang) == 3: elif len(lang) == 3:
if lang in iso639['by_3b']: if lang in iso639['by_3']:
ans = iso639['by_3b'][lang] ans = iso639['by_3'][lang]
else:
ans = iso639['by_3t'].get(lang, ans)
return lang_trans(ans) return lang_trans(ans)
@ -401,7 +401,7 @@ def calibre_langcode_to_name(lc, localize=True):
iso639 = _load_iso639() iso639 = _load_iso639()
translate = _ if localize else lambda x: x translate = _ if localize else lambda x: x
try: try:
return translate(iso639['by_3t'][lc]) return translate(iso639['by_3'][lc])
except: except:
pass pass
return lc return lc
@ -426,10 +426,8 @@ def canonicalize_lang(raw):
if ans is not None: if ans is not None:
return ans return ans
elif len(raw) == 3: elif len(raw) == 3:
if raw in iso639['by_3t']: if raw in iso639['by_3']:
return raw return raw
if raw in iso639['3bto3t']:
return iso639['3bto3t'][raw]
return iso639['name_map'].get(raw, None) return iso639['name_map'].get(raw, None)
@ -443,7 +441,7 @@ def lang_map():
translate = _ translate = _
global _lang_map global _lang_map
if _lang_map is None: if _lang_map is None:
_lang_map = {k:translate(v) for k, v in iteritems(iso639['by_3t'])} _lang_map = {k:translate(v) for k, v in iteritems(iso639['by_3'])}
return _lang_map return _lang_map
@ -467,7 +465,7 @@ def langnames_to_langcodes(names):
translate = _ translate = _
ans = {} ans = {}
names = set(names) names = set(names)
for k, v in iteritems(iso639['by_3t']): for k, v in iteritems(iso639['by_3']):
tv = translate(v) tv = translate(v)
if tv in names: if tv in names:
names.remove(tv) names.remove(tv)