mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add code to get all words from a ebook with their declared locale
This commit is contained in:
parent
e476f7d889
commit
7d442f9320
146
src/calibre/ebooks/oeb/polish/spell.py
Normal file
146
src/calibre/ebooks/oeb/polish/spell.py
Normal file
@ -0,0 +1,146 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from calibre.spell.dictionary import parse_lang_code
|
||||||
|
from calibre.ebooks.oeb.base import barename
|
||||||
|
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container
|
||||||
|
from calibre.ebooks.oeb.polish.toc import find_existing_toc
|
||||||
|
|
||||||
|
_patterns = None
|
||||||
|
|
||||||
|
class Patterns(object):
|
||||||
|
|
||||||
|
__slots__ = ('sanitize_invisible_pat', 'split_pat', 'digit_pat')
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
import regex
|
||||||
|
# Remove soft hyphens/zero width spaces/control codes
|
||||||
|
self.sanitize_invisible_pat = regex.compile(
|
||||||
|
r'[\u00ad\u200b\u200c\u200d\ufeff\0-\x08\x0b\x0c\x0e-\x1f\x7f]', regex.VERSION1 | regex.UNICODE)
|
||||||
|
self.split_pat = regex.compile(
|
||||||
|
r'\W+', flags=regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.UNICODE)
|
||||||
|
self.digit_pat = regex.compile(
|
||||||
|
r'^\d+$', flags=regex.VERSION1 | regex.WORD | regex.UNICODE)
|
||||||
|
|
||||||
|
def patterns():
|
||||||
|
global _patterns
|
||||||
|
if _patterns is None:
|
||||||
|
_patterns = Patterns()
|
||||||
|
return _patterns
|
||||||
|
|
||||||
|
class Location(object):
|
||||||
|
|
||||||
|
__slots__ = ('file_name', 'sourceline')
|
||||||
|
|
||||||
|
def __init__(self, file_name=None, sourceline=None):
|
||||||
|
self.file_name, self.sourceline = file_name, sourceline
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '%s:%s' % (self.file_name, self.sourceline)
|
||||||
|
__str__ = __repr__
|
||||||
|
|
||||||
|
def filter_words(word):
|
||||||
|
if not word:
|
||||||
|
return False
|
||||||
|
p = patterns()
|
||||||
|
if p.digit_pat.match(word) is not None:
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
def get_words(text):
|
||||||
|
p = patterns()
|
||||||
|
text = p.sanitize_invisible_pat.sub('', text)
|
||||||
|
return filter(filter_words, p.split_pat.split(text))
|
||||||
|
|
||||||
|
def add_words(candidates, sourceline, words, file_name, locale):
|
||||||
|
if candidates:
|
||||||
|
loc = Location(file_name, sourceline)
|
||||||
|
for word in candidates:
|
||||||
|
words[(word, locale)].append(loc)
|
||||||
|
|
||||||
|
def read_words_from_opf(root, words, file_name, book_locale):
|
||||||
|
for tag in root.xpath('//*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc']):
|
||||||
|
tagname = barename(tag.tag)
|
||||||
|
if not tag.text or tagname in {'identifier', 'language'}:
|
||||||
|
continue
|
||||||
|
add_words(get_words(tag.text), tag.sourceline, words, file_name, book_locale)
|
||||||
|
file_as = '{%s}file-as' % OPF_NAMESPACES['opf']
|
||||||
|
file_as = tag.get(file_as, None)
|
||||||
|
if file_as:
|
||||||
|
add_words(get_words(file_as), tag.sourceline, words, file_name, book_locale)
|
||||||
|
|
||||||
|
def read_words_from_ncx(root, words, file_name, book_locale):
|
||||||
|
for tag in root.xpath('//*[local-name()="text"]'):
|
||||||
|
if not tag.text:
|
||||||
|
continue
|
||||||
|
add_words(get_words(tag.text), tag.sourceline, words, file_name, book_locale)
|
||||||
|
|
||||||
|
def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
|
||||||
|
tagname = barename(tag.tag)
|
||||||
|
if tagname not in {'script', 'style'} and tag.text is not None:
|
||||||
|
add_words(get_words(tag.text), tag.sourceline, words, file_name, locale)
|
||||||
|
if tag.tail is not None:
|
||||||
|
add_words(get_words(tag.tail), tag.sourceline, words, file_name, parent_locale)
|
||||||
|
for attr in ('alt', 'title'):
|
||||||
|
text = tag.get(attr, None)
|
||||||
|
if text:
|
||||||
|
add_words(get_words(text), tag.sourceline, words, file_name, locale)
|
||||||
|
|
||||||
|
def locale_from_tag(tag):
|
||||||
|
if 'lang' in tag.attrib:
|
||||||
|
loc = parse_lang_code(tag.get('lang'))
|
||||||
|
if loc is not None:
|
||||||
|
return loc
|
||||||
|
if '{http://www.w3.org/XML/1998/namespace}lang' in tag.attrib:
|
||||||
|
loc = parse_lang_code(tag.get('{http://www.w3.org/XML/1998/namespace}lang'))
|
||||||
|
if loc is not None:
|
||||||
|
return loc
|
||||||
|
|
||||||
|
def read_words_from_html(root, words, file_name, book_locale):
|
||||||
|
stack = [(root, book_locale)]
|
||||||
|
while stack:
|
||||||
|
parent, parent_locale = stack.pop()
|
||||||
|
locale = locale_from_tag(parent) or parent_locale
|
||||||
|
read_words_from_html_tag(parent, words, file_name, parent_locale, locale)
|
||||||
|
stack.extend((tag, parent_locale) for tag in parent.iterchildren('*'))
|
||||||
|
|
||||||
|
def get_all_words(container, book_locale):
|
||||||
|
words = defaultdict(list)
|
||||||
|
file_names = [name for name, linear in container.spine_names] + [container.opf_name]
|
||||||
|
toc = find_existing_toc(container)
|
||||||
|
if toc is not None and container.exists(toc):
|
||||||
|
file_names.append(toc)
|
||||||
|
for file_name in file_names:
|
||||||
|
if not container.exists(file_name):
|
||||||
|
continue
|
||||||
|
root = container.parsed(file_name)
|
||||||
|
if file_name == container.opf_name:
|
||||||
|
read_words_from_opf(root, words, file_name, book_locale)
|
||||||
|
elif file_name == toc:
|
||||||
|
read_words_from_ncx(root, words, file_name, book_locale)
|
||||||
|
else:
|
||||||
|
read_words_from_html(root, words, file_name, book_locale)
|
||||||
|
|
||||||
|
def group_sort(locations):
|
||||||
|
order = {}
|
||||||
|
for loc in locations:
|
||||||
|
if loc.file_name not in order:
|
||||||
|
order[loc.file_name] = len(order)
|
||||||
|
return sorted(locations, key=lambda l:(order[l.file_name], l.sourceline))
|
||||||
|
|
||||||
|
return {k:group_sort(v) for k, v in words.iteritems()}
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
import pprint
|
||||||
|
from calibre.gui2.tweak_book import set_book_locale, dictionaries
|
||||||
|
container = get_container(sys.argv[-1], tweak_mode=True)
|
||||||
|
set_book_locale(container.mi.language)
|
||||||
|
pprint.pprint(get_all_words(container, dictionaries.default_locale))
|
@ -113,7 +113,7 @@ class AddDictionary(QDialog): # {{{
|
|||||||
QDialog.accept(self)
|
QDialog.accept(self)
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
class ManageDictionaries(Dialog):
|
class ManageDictionaries(Dialog): # {{{
|
||||||
|
|
||||||
def __init__(self, parent=None):
|
def __init__(self, parent=None):
|
||||||
Dialog.__init__(self, _('Manage dictionaries'), 'manage-dictionaries', parent=parent)
|
Dialog.__init__(self, _('Manage dictionaries'), 'manage-dictionaries', parent=parent)
|
||||||
@ -288,6 +288,7 @@ class ManageDictionaries(Dialog):
|
|||||||
pl = dprefs['preferred_dictionaries']
|
pl = dprefs['preferred_dictionaries']
|
||||||
pl[locale] = d.id
|
pl[locale] = d.id
|
||||||
dprefs['preferred_dictionaries'] = pl
|
dprefs['preferred_dictionaries'] = pl
|
||||||
|
# }}}
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
app = QApplication([])
|
app = QApplication([])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user