From 15e6c1d212ebe9c4f1a935914ff9dccf2f93e5da Mon Sep 17 00:00:00 2001
From: ldolse
Subject: [PATCH 01/14] ... inside blockquote, but not the other way around

---
 src/calibre/ebooks/conversion/utils.py | 28 ++++++---
 src/calibre/ebooks/mobi/reader.py      |  3 +
 src/calibre/utils/wordcount.py         | 83 ++++++++++++++++++++++++++
 3 files changed, 105 insertions(+), 9 deletions(-)
 create mode 100644 src/calibre/utils/wordcount.py

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 11979b933c..8baeefcd1a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
 import re
 from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
+from calibre.utils.wordcount import get_wordcount_obj
 
 class PreProcessor(object):
 
@@ -107,7 +108,7 @@ class PreProcessor(object):
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
         html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
-
+
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
@@ -168,9 +169,21 @@ class PreProcessor(object):
             #print "blanks between paragraphs is marked True"
         else:
             blanks_between_paragraphs = False
+
+        # Count the words in the document to estimate how many chapters to look for
+        word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+        word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
+        wordcount = get_wordcount_obj(word_count_text)
+
         #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
         #
+        min_chapters = 10
+        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+        self.html_preprocess_sections = len(heading.findall(html))

diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ ... @@
+
+        self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)
+        self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', '\g<styletags>'+'\g<para>', self.processed_html)
 
     def remove_random_bytes(self, html):
         return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08',

diff --git a/src/calibre/utils/wordcount.py b/src/calibre/utils/wordcount.py
new file mode 100644
index 0000000000..2bc91f4014
--- /dev/null
+++ b/src/calibre/utils/wordcount.py
@@ -0,0 +1,83 @@
+#!/usr/bin/python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+"""
+Get word, character, and Asian character counts
+
+1. Get a word count as a dictionary:
+    wc = get_wordcount(text)
+    words = wc['words'] # etc.
+
+2. Get a word count as an object
+    wc = get_wordcount_obj(text)
+    words = wc.words # etc.
+
+properties counted:
+    * characters
+    * chars_no_spaces
+    * asian_chars
+    * non_asian_words
+    * words
+
+Python License
+"""
+__version__ = 0.1
+__author__ = "Ryan Ginstrom"
+
+IDEOGRAPHIC_SPACE = 0x3000
+
+def is_asian(char):
+    """Is the character Asian?"""
+
+    # 0x3000 is ideographic space (i.e. double-byte space)
+    # Anything over is an Asian character
+    return ord(char) > IDEOGRAPHIC_SPACE
+
+def filter_jchars(c):
+    """Filters Asian characters to spaces"""
+    if is_asian(c):
+        return ' '
+    return c
+
+def nonj_len(word):
+    u"""Returns number of non-Asian words in {word}
+    - 日本語AアジアンB -> 2
+    - hello -> 1
+    @param word: A word, possibly containing Asian characters
+    """
+    # Here are the steps:
+    # 本spam日eggs
+    # -> [' ', 's', 'p', 'a', 'm', ' ', 'e', 'g', 'g', 's']
+    # -> ' spam eggs'
+    # -> ['spam', 'eggs']
+    # The length of which is 2!
+    chars = [filter_jchars(c) for c in word]
+    return len(u''.join(chars).split())
+
+def get_wordcount(text):
+    """Get the word/character count for text
+
+    @param text: The text of the segment
+    """
+
+    characters = len(text)
+    chars_no_spaces = sum([not x.isspace() for x in text])
+    asian_chars = sum([is_asian(x) for x in text])
+    non_asian_words = nonj_len(text)
+    words = non_asian_words + asian_chars
+
+    return dict(characters=characters,
+                chars_no_spaces=chars_no_spaces,
+                asian_chars=asian_chars,
+                non_asian_words=non_asian_words,
+                words=words)
+
+def dict2obj(dictionary):
+    """Transform a dictionary into an object"""
+    class Obj(object):
+        def __init__(self, dictionary):
+            self.__dict__.update(dictionary)
+    return Obj(dictionary)
+
+def get_wordcount_obj(text):
+    """Get the wordcount as an object rather than a dictionary"""
+    return dict2obj(get_wordcount(text))
From a1dcbb33c1c9ff12c8dbc2092ef4172c014dc827 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Mon, 29 Nov 2010 16:38:17 +0800
Subject: [PATCH 02/14] moved chapter markup to a function, tied preprocessing
 into word count

---
 src/calibre/ebooks/conversion/utils.py | 149 +++++++++++++++----------
 1 file changed, 87 insertions(+), 62 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 8baeefcd1a..c42068cfe0 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -65,7 +65,7 @@ class PreProcessor(object):
         inspect. Percent is the minimum percent of line endings which should be marked up to
         return true.
         '''
-        htm_end_ere = re.compile('</p>', re.DOTALL)
+        htm_end_ere = re.compile('</(p|div)>', re.DOTALL)
         line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
         htm_end = htm_end_ere.findall(raw)
         line_end = line_end_ere.findall(raw)
@@ -102,12 +102,93 @@ class PreProcessor(object):
         with open(os.path.join(odir, name), 'wb') as f:
             f.write(raw.encode('utf-8'))
 
+    def get_word_count(self, html):
+        totalwords = 0
+        word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+        word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
+        wordcount = get_wordcount_obj(word_count_text)
+        return wordcount.words
+
+    def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+        # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
+        # minimum of chapters to search for
+        self.min_chapters = 1
+        if wordcount > 7000:
+            self.min_chapters = wordcount / 7000
+        print "minimum chapters required are: "+str(self.min_chapters)
+        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+        self.html_preprocess_sections = len(heading.findall(html))
+        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+
+        # Build the Regular Expressions in pieces
+        init_lookahead = "(?=<(p|div))"
+        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
+        chapter_header_open = r"(?P<chap>"
+        title_header_open = r"(?P<title>"
+        chapter_header_close = ")\s*"
+        title_header_close = ")"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
+
+        if blanks_between_paragraphs:
+            blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
+        else:
+            blank_lines = ""
+        opt_title_open = "("
+        opt_title_close = ")?"
+        n_lookahead_open = "\s+(?!"
+        n_lookahead_close = ")"
+
+        default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
+
+        chapter_types = [
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+            [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+            ]
+
+        # Start with most typical chapter headings, get more aggressive until one works
+        for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+            if self.html_preprocess_sections >= self.min_chapters:
+                break
+            full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+            self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+            if lookahead_ignorecase:
+                chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+                chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+            else:
+                chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+
+            html = chapdetect.sub(self.chapter_head, html)
+
+        words_per_chptr = wordcount
+        if words_per_chptr > 0 and self.html_preprocess_sections > 0:
+            words_per_chptr = wordcount / self.html_preprocess_sections
+        print "Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters"
+
+        return html
+
+
     def __call__(self, html):
         self.log("*********  Preprocessing HTML  *********")
+
+        # Count the words in the document to estimate how many chapters to look for and whether
+        # other types of processing are attempted
+        totalwords = self.get_word_count(html)
+
+        if totalwords < 10:
+            print "not enough text, not preprocessing"
+            return html
+
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-        html = re.sub(r"\s*</p>", "</p>\n", html)
-        html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
+        html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
+        html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)