From 15e6c1d212ebe9c4f1a935914ff9dccf2f93e5da Mon Sep 17 00:00:00 2001
From: ldolse
Subject: [PATCH] ... inside blockquote, but not the other way around
---
 src/calibre/ebooks/conversion/utils.py |   28 ++++++---
 src/calibre/ebooks/mobi/reader.py      |    3 +
 src/calibre/utils/wordcount.py         |   83 ++++++++++++++++++++++++++
 3 files changed, 105 insertions(+), 9 deletions(-)
create mode 100644 src/calibre/utils/wordcount.py
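
The conversion/utils.py change strips markup and counts the document's words so chapter
detection can estimate how many headings to look for; the mobi/reader.py change reorders
mis-nested block tags so that paragraphs end up inside blockquotes and divs, not the other
way around. A minimal standalone sketch of that reordering, using the two substitutions the
patch adds (the sample snippet and variable name are illustrative, not from calibre):

    import re

    # malformed nesting of the kind MOBI markup can contain
    snippet = '<p><blockquote>Quoted text</blockquote></p>'

    # pull </p> in front of closing blockquote/div tags,
    # so the paragraph closes inside the quote
    snippet = re.sub(r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)',
                     r'\g<para>' + r'\g<blockquote>', snippet)
    # push opening blockquote/div tags in front of <p>,
    # so the quote opens outside the paragraph
    snippet = re.sub(r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})',
                     r'\g<blockquote>' + r'\g<para>', snippet)

    print(snippet)  # -> <blockquote><p>Quoted text</p></blockquote>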
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 11979b933c..8baeefcd1a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
 import re
 from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
+from calibre.utils.wordcount import get_wordcount_obj
 
 
 class PreProcessor(object):
@@ -107,7 +108,7 @@ class PreProcessor(object):
         # Arrange line feeds and </p> tags so the line_length and no_markup
         # functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
-        html = re.sub(r"\s*<p[^>]*>\s*", "\n<p>", html)
+        html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
@@ -168,9 +169,21 @@ class PreProcessor(object):
             #print "blanks between paragraphs is marked True"
         else:
             blanks_between_paragraphs = False
+
+        # Count the words in the document to estimate how many chapters to look for
+        word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+        word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
+        wordcount = get_wordcount_obj(word_count_text)
+
         #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
         #
+        min_chapters = 10
+        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
]*>.*?', '', html) + word_count_text = re.sub(r'<[^>]*>', '', word_count_text) + wordcount = get_wordcount_obj(word_count_text) + + #self.dump(html, 'before_chapter_markup') # detect chapters/sections to match xpath or splitting logic # + min_chapters = 10 + heading = re.compile(']*>)', '\g
(]*>\s*){1,})(?P
', self.processed_html) + self.processed_html = re.sub(r'(?i)(?P]*>)\s*(?P
(]*>\s*){1,})', '\g'+'\g', self.processed_html) + def remove_random_bytes(self, html): return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08', diff --git a/src/calibre/utils/wordcount.py b/src/calibre/utils/wordcount.py new file mode 100644 index 0000000000..2bc91f4014 --- /dev/null +++ b/src/calibre/utils/wordcount.py @@ -0,0 +1,83 @@ +#!/usr/bin/python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +""" +Get word, character, and Asian character counts + +1. Get a word count as a dictionary: + wc = get_wordcount(text) + words = wc['words'] # etc. + +2. Get a word count as an object + wc = get_wordcount_obj(text) + words = wc.words # etc. + +properties counted: + * characters + * chars_no_spaces + * asian_chars + * non_asian_words + * words + +Python License +""" +__version__ = 0.1 +__author__ = "Ryan Ginstrom" + +IDEOGRAPHIC_SPACE = 0x3000 + +def is_asian(char): + """Is the character Asian?""" + + # 0x3000 is ideographic space (i.e. double-byte space) + # Anything over is an Asian character + return ord(char) > IDEOGRAPHIC_SPACE + +def filter_jchars(c): + """Filters Asian characters to spaces""" + if is_asian(c): + return ' ' + return c + +def nonj_len(word): + u"""Returns number of non-Asian words in {word} + - 日本語AアジアンB -> 2 + - hello -> 1 + @param word: A word, possibly containing Asian characters + """ + # Here are the steps: + # 本spam日eggs + # -> [' ', 's', 'p', 'a', 'm', ' ', 'e', 'g', 'g', 's'] + # -> ' spam eggs' + # -> ['spam', 'eggs'] + # The length of which is 2! + chars = [filter_jchars(c) for c in word] + return len(u''.join(chars).split()) + +def get_wordcount(text): + """Get the word/character count for text + + @param text: The text of the segment + """ + + characters = len(text) + chars_no_spaces = sum([not x.isspace() for x in text]) + asian_chars = sum([is_asian(x) for x in text]) + non_asian_words = nonj_len(text) + words = non_asian_words + asian_chars + + return dict(characters=characters, + chars_no_spaces=chars_no_spaces, + asian_chars=asian_chars, + non_asian_words=non_asian_words, + words=words) + +def dict2obj(dictionary): + """Transform a dictionary into an object""" + class Obj(object): + def __init__(self, dictionary): + self.__dict__.update(dictionary) + return Obj(dictionary) + +def get_wordcount_obj(text): + """Get the wordcount as an object rather than a dictionary""" + return dict2obj(get_wordcount(text))