mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Use the ICU break iterator to split text into words
This commit is contained in:
parent
b12b830839
commit
66958760fe
@@ -9,6 +9,7 @@ __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
|||||||
import sys
|
import sys
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from calibre.spell.break_iterator import has_break_iterator, split_into_words
|
||||||
from calibre.spell.dictionary import parse_lang_code
|
from calibre.spell.dictionary import parse_lang_code
|
||||||
from calibre.ebooks.oeb.base import barename
|
from calibre.ebooks.oeb.base import barename
|
||||||
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container
|
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container
|
||||||
@@ -38,10 +39,10 @@ def patterns():
|
|||||||
|
|
||||||
class Location(object):
|
class Location(object):
|
||||||
|
|
||||||
__slots__ = ('file_name', 'sourceline')
|
__slots__ = ('file_name', 'sourceline', 'original_word')
|
||||||
|
|
||||||
def __init__(self, file_name=None, sourceline=None):
|
def __init__(self, file_name=None, sourceline=None, original_word=None):
|
||||||
self.file_name, self.sourceline = file_name, sourceline
|
self.file_name, self.sourceline, self.original_word = file_name, sourceline, original_word
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '%s:%s' % (self.file_name, self.sourceline)
|
return '%s:%s' % (self.file_name, self.sourceline)
|
||||||
@@ -55,45 +56,55 @@ def filter_words(word):
|
|||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def get_words(text):
|
if has_break_iterator:
|
||||||
p = patterns()
|
def get_words(text, lang):
|
||||||
text = p.sanitize_invisible_pat.sub('', text)
|
try:
|
||||||
return filter(filter_words, p.split_pat.split(text))
|
ans = split_into_words(unicode(text), lang)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return ()
|
||||||
|
return filter(filter_words, ans)
|
||||||
|
else:
|
||||||
|
def get_words(text, lang):
|
||||||
|
p = patterns()
|
||||||
|
return filter(filter_words, p.split_pat.split(text))
|
||||||
|
|
||||||
def add_words(candidates, sourceline, words, file_name, locale):
|
def add_words(text, sourceline, words, file_name, locale):
|
||||||
|
candidates = get_words(text, locale.langcode)
|
||||||
if candidates:
|
if candidates:
|
||||||
loc = Location(file_name, sourceline)
|
p = patterns()
|
||||||
for word in candidates:
|
for word in candidates:
|
||||||
words[(word, locale)].append(loc)
|
sword = p.sanitize_invisible_pat.sub('', word)
|
||||||
|
loc = Location(file_name, sourceline, word)
|
||||||
|
words[(sword, locale)].append(loc)
|
||||||
|
|
||||||
def read_words_from_opf(root, words, file_name, book_locale):
|
def read_words_from_opf(root, words, file_name, book_locale):
|
||||||
for tag in root.xpath('//*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc']):
|
for tag in root.xpath('//*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc']):
|
||||||
tagname = barename(tag.tag)
|
tagname = barename(tag.tag)
|
||||||
if not tag.text or tagname in {'identifier', 'language'}:
|
if not tag.text or tagname in {'identifier', 'language'}:
|
||||||
continue
|
continue
|
||||||
add_words(get_words(tag.text), tag.sourceline, words, file_name, book_locale)
|
add_words(tag.text, tag.sourceline, words, file_name, book_locale)
|
||||||
file_as = '{%s}file-as' % OPF_NAMESPACES['opf']
|
file_as = '{%s}file-as' % OPF_NAMESPACES['opf']
|
||||||
file_as = tag.get(file_as, None)
|
file_as = tag.get(file_as, None)
|
||||||
if file_as:
|
if file_as:
|
||||||
add_words(get_words(file_as), tag.sourceline, words, file_name, book_locale)
|
add_words(file_as, tag.sourceline, words, file_name, book_locale)
|
||||||
|
|
||||||
def read_words_from_ncx(root, words, file_name, book_locale):
|
def read_words_from_ncx(root, words, file_name, book_locale):
|
||||||
for tag in root.xpath('//*[local-name()="text"]'):
|
for tag in root.xpath('//*[local-name()="text"]'):
|
||||||
if not tag.text:
|
if not tag.text:
|
||||||
continue
|
continue
|
||||||
add_words(get_words(tag.text), tag.sourceline, words, file_name, book_locale)
|
add_words(tag.text, tag.sourceline, words, file_name, book_locale)
|
||||||
|
|
||||||
def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
|
def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
|
||||||
tagname = barename(tag.tag)
|
tagname = barename(tag.tag)
|
||||||
if tagname not in {'script', 'style', 'link', 'head'}:
|
if tagname not in {'script', 'style', 'link', 'head'}:
|
||||||
if tag.text is not None:
|
if tag.text is not None:
|
||||||
add_words(get_words(tag.text), tag.sourceline, words, file_name, locale)
|
add_words(tag.text, tag.sourceline, words, file_name, locale)
|
||||||
for attr in {'alt', 'title'}:
|
for attr in {'alt', 'title'}:
|
||||||
text = tag.get(attr, None)
|
text = tag.get(attr, None)
|
||||||
if text:
|
if text:
|
||||||
add_words(get_words(text), tag.sourceline, words, file_name, locale)
|
add_words(text, tag.sourceline, words, file_name, locale)
|
||||||
if tag.tail is not None:
|
if tag.tail is not None:
|
||||||
add_words(get_words(tag.tail), tag.sourceline, words, file_name, parent_locale)
|
add_words(tag.tail, tag.sourceline, words, file_name, parent_locale)
|
||||||
|
|
||||||
def locale_from_tag(tag):
|
def locale_from_tag(tag):
|
||||||
if 'lang' in tag.attrib:
|
if 'lang' in tag.attrib:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user