Add code to get all words from a ebook with their declared locale

2025-07-09 03:04:10 -04:00 · 2014-04-12 12:05:32 +05:30 · 2014-04-12 12:05:32 +05:30 · 7d442f9320
commit 7d442f9320
parent e476f7d889
2 changed files with 148 additions and 1 deletions
--- a/src/calibre/ebooks/oeb/polish/spell.py
+++ b/src/calibre/ebooks/oeb/polish/spell.py
@ -0,0 +1,146 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import sys
+from collections import defaultdict
+
+from calibre.spell.dictionary import parse_lang_code
+from calibre.ebooks.oeb.base import barename
+from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container
+from calibre.ebooks.oeb.polish.toc import find_existing_toc
+
+_patterns = None
+
+class Patterns(object):
+
+    __slots__ = ('sanitize_invisible_pat', 'split_pat', 'digit_pat')
+
+    def __init__(self):
+        import regex
+        # Remove soft hyphens/zero width spaces/control codes
+        self.sanitize_invisible_pat = regex.compile(
+            r'[\u00ad\u200b\u200c\u200d\ufeff\0-\x08\x0b\x0c\x0e-\x1f\x7f]', regex.VERSION1 | regex.UNICODE)
+        self.split_pat = regex.compile(
+            r'\W+', flags=regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.UNICODE)
+        self.digit_pat = regex.compile(
+            r'^\d+$', flags=regex.VERSION1 | regex.WORD | regex.UNICODE)
+
+def patterns():
+    global _patterns
+    if _patterns is None:
+        _patterns = Patterns()
+    return _patterns
+
+class Location(object):
+
+    __slots__ = ('file_name', 'sourceline')
+
+    def __init__(self, file_name=None, sourceline=None):
+        self.file_name, self.sourceline = file_name, sourceline
+
+    def __repr__(self):
+        return '%s:%s' % (self.file_name, self.sourceline)
+    __str__ = __repr__
+
+def filter_words(word):
+    if not word:
+        return False
+    p = patterns()
+    if p.digit_pat.match(word) is not None:
+        return False
+    return True
+
+def get_words(text):
+    p = patterns()
+    text = p.sanitize_invisible_pat.sub('', text)
+    return filter(filter_words, p.split_pat.split(text))
+
+def add_words(candidates, sourceline, words, file_name, locale):
+    if candidates:
+        loc = Location(file_name, sourceline)
+        for word in candidates:
+            words[(word, locale)].append(loc)
+
+def read_words_from_opf(root, words, file_name, book_locale):
+    for tag in root.xpath('//*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc']):
+        tagname = barename(tag.tag)
+        if not tag.text or tagname in {'identifier', 'language'}:
+            continue
+        add_words(get_words(tag.text), tag.sourceline, words, file_name, book_locale)
+        file_as = '{%s}file-as' % OPF_NAMESPACES['opf']
+        file_as = tag.get(file_as, None)
+        if file_as:
+            add_words(get_words(file_as), tag.sourceline, words, file_name, book_locale)
+
+def read_words_from_ncx(root, words, file_name, book_locale):
+    for tag in root.xpath('//*[local-name()="text"]'):
+        if not tag.text:
+            continue
+        add_words(get_words(tag.text), tag.sourceline, words, file_name, book_locale)
+
+def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
+    tagname = barename(tag.tag)
+    if tagname not in {'script', 'style'} and tag.text is not None:
+        add_words(get_words(tag.text), tag.sourceline, words, file_name, locale)
+    if tag.tail is not None:
+        add_words(get_words(tag.tail), tag.sourceline, words, file_name, parent_locale)
+    for attr in ('alt', 'title'):
+        text = tag.get(attr, None)
+        if text:
+            add_words(get_words(text), tag.sourceline, words, file_name, locale)
+
+def locale_from_tag(tag):
+    if 'lang' in tag.attrib:
+        loc = parse_lang_code(tag.get('lang'))
+        if loc is not None:
+            return loc
+    if '{http://www.w3.org/XML/1998/namespace}lang' in tag.attrib:
+        loc = parse_lang_code(tag.get('{http://www.w3.org/XML/1998/namespace}lang'))
+        if loc is not None:
+            return loc
+
+def read_words_from_html(root, words, file_name, book_locale):
+    stack = [(root, book_locale)]
+    while stack:
+        parent, parent_locale = stack.pop()
+        locale = locale_from_tag(parent) or parent_locale
+        read_words_from_html_tag(parent, words, file_name, parent_locale, locale)
+        stack.extend((tag, parent_locale) for tag in parent.iterchildren('*'))
+
+def get_all_words(container, book_locale):
+    words = defaultdict(list)
+    file_names = [name for name, linear in container.spine_names] + [container.opf_name]
+    toc = find_existing_toc(container)
+    if toc is not None and container.exists(toc):
+        file_names.append(toc)
+    for file_name in file_names:
+        if not container.exists(file_name):
+            continue
+        root = container.parsed(file_name)
+        if file_name == container.opf_name:
+            read_words_from_opf(root, words, file_name, book_locale)
+        elif file_name == toc:
+            read_words_from_ncx(root, words, file_name, book_locale)
+        else:
+            read_words_from_html(root, words, file_name, book_locale)
+
+    def group_sort(locations):
+        order = {}
+        for loc in locations:
+            if loc.file_name not in order:
+                order[loc.file_name] = len(order)
+        return sorted(locations, key=lambda l:(order[l.file_name], l.sourceline))
+
+    return {k:group_sort(v) for k, v in words.iteritems()}
+
+if __name__ == '__main__':
+    import pprint
+    from calibre.gui2.tweak_book import set_book_locale, dictionaries
+    container = get_container(sys.argv[-1], tweak_mode=True)
+    set_book_locale(container.mi.language)
+    pprint.pprint(get_all_words(container, dictionaries.default_locale))
--- a/src/calibre/gui2/tweak_book/spell.py
+++ b/src/calibre/gui2/tweak_book/spell.py
@ -113,7 +113,7 @@ class AddDictionary(QDialog):  # {{{
        QDialog.accept(self)
 # }}}

-class ManageDictionaries(Dialog):
+class ManageDictionaries(Dialog):  # {{{

    def __init__(self, parent=None):
        Dialog.__init__(self, _('Manage dictionaries'), 'manage-dictionaries', parent=parent)
@ -288,6 +288,7 @@ class ManageDictionaries(Dialog):
        pl = dprefs['preferred_dictionaries']
        pl[locale] = d.id
        dprefs['preferred_dictionaries'] = pl
+# }}}

 if __name__ == '__main__':
    app = QApplication([])