From 15e6c1d212ebe9c4f1a935914ff9dccf2f93e5da Mon Sep 17 00:00:00 2001
From: ldolse
Date: Tue, 23 Nov 2010 13:54:45 +0800
Subject: [PATCH] blockquotes and paragraph tags also have a conflict in mobi
 files - lxml allows <p> inside blockquote, but not the other way around

---
 src/calibre/ebooks/conversion/utils.py | 28 ++++++---
 src/calibre/ebooks/mobi/reader.py      |  3 +
 src/calibre/utils/wordcount.py         | 83 ++++++++++++++++++++++++++
 3 files changed, 105 insertions(+), 9 deletions(-)
 create mode 100644 src/calibre/utils/wordcount.py

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 11979b933c..8baeefcd1a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
 import re
 from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
+from calibre.utils.wordcount import get_wordcount_obj
 
 class PreProcessor(object):
 
@@ -107,7 +108,7 @@ class PreProcessor(object):
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
         html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
-        
+
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
@@ -168,9 +169,21 @@ class PreProcessor(object):
             #print "blanks between paragraphs is marked True"
         else:
             blanks_between_paragraphs = False
+
+        # Count the words in the document to estimate how many chapters to look for
+        word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+        word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
+        wordcount = get_wordcount_obj(word_count_text)
+
+
         #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
         #
+        min_chapters = 10
+        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+        self.html_preprocess_sections = len(heading.findall(html))
+        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+
         # Build the Regular Expressions in pieces
         init_lookahead = "(?=<(p|div))"
         chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
@@ -192,12 +205,7 @@ class PreProcessor(object):
         n_lookahead_close = ")"
 
         default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
-        
-        min_chapters = 10
-        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
-        self.html_preprocess_sections = len(heading.findall(html))
-        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
-        
+
         chapter_types = [
             [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
             [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
@@ -219,9 +227,11 @@ class PreProcessor(object):
         else:
             chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
         chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
-        
+
         html = chapdetect.sub(self.chapter_head, html)
-        
+
+        words_per_chptr = wordcount.words / self.html_preprocess_sections
+        print "wordcount is: "+ str(wordcount.words)+", Average words per chapter is: "+str(words_per_chptr)+", Marked "+str(self.html_preprocess_sections)+" chapters"
 
         ###### Unwrap lines ######
         #
diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py
index c4845f9443..309023ede9 100644
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -480,6 +480,9 @@ class MobiReader(object):
         # - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
         self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)
         self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', '\g<styletags>'+'\g<para>', self.processed_html)
+        self.processed_html = re.sub(r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', '\g<para>'+'\g<blockquote>', self.processed_html)
+        self.processed_html = re.sub(r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', '\g<blockquote>'+'\g<para>', self.processed_html)
+
     def remove_random_bytes(self, html):
         return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08',
diff --git a/src/calibre/utils/wordcount.py b/src/calibre/utils/wordcount.py
new file mode 100644
index 0000000000..2bc91f4014
--- /dev/null
+++ b/src/calibre/utils/wordcount.py
@@ -0,0 +1,83 @@
+#!/usr/bin/python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+"""
+Get word, character, and Asian character counts
+
+1. Get a word count as a dictionary:
+    wc = get_wordcount(text)
+    words = wc['words'] # etc.
+
+2. Get a word count as an object
+    wc = get_wordcount_obj(text)
+    words = wc.words # etc.
+
+properties counted:
+    * characters
+    * chars_no_spaces
+    * asian_chars
+    * non_asian_words
+    * words
+
+Python License
+"""
+__version__ = 0.1
+__author__ = "Ryan Ginstrom"
+
+IDEOGRAPHIC_SPACE = 0x3000
+
+def is_asian(char):
+    """Is the character Asian?"""
+
+    # 0x3000 is ideographic space (i.e. double-byte space)
+    # Anything over is an Asian character
+    return ord(char) > IDEOGRAPHIC_SPACE
+
+def filter_jchars(c):
+    """Filters Asian characters to spaces"""
+    if is_asian(c):
+        return ' '
+    return c
+
+def nonj_len(word):
+    u"""Returns number of non-Asian words in {word}
+    - 日本語AアジアンB -> 2
+    - hello -> 1
+    @param word: A word, possibly containing Asian characters
+    """
+    # Here are the steps:
+    # 本spam日eggs
+    # -> [' ', 's', 'p', 'a', 'm', ' ', 'e', 'g', 'g', 's']
+    # -> ' spam eggs'
+    # -> ['spam', 'eggs']
+    # The length of which is 2!
+    chars = [filter_jchars(c) for c in word]
+    return len(u''.join(chars).split())
+
+def get_wordcount(text):
+    """Get the word/character count for text
+
+    @param text: The text of the segment
+    """
+
+    characters = len(text)
+    chars_no_spaces = sum([not x.isspace() for x in text])
+    asian_chars = sum([is_asian(x) for x in text])
+    non_asian_words = nonj_len(text)
+    words = non_asian_words + asian_chars
+
+    return dict(characters=characters,
+                chars_no_spaces=chars_no_spaces,
+                asian_chars=asian_chars,
+                non_asian_words=non_asian_words,
+                words=words)
+
+def dict2obj(dictionary):
+    """Transform a dictionary into an object"""
+    class Obj(object):
+        def __init__(self, dictionary):
+            self.__dict__.update(dictionary)
+    return Obj(dictionary)
+
+def get_wordcount_obj(text):
+    """Get the wordcount as an object rather than a dictionary"""
+    return dict2obj(get_wordcount(text))
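
A minimal usage sketch of the new calibre.utils.wordcount helper, assuming the
patch is applied; the sample strings are invented for illustration and are not
part of the patch:

    # -*- coding: utf-8 -*-
    from calibre.utils.wordcount import get_wordcount_obj

    wc = get_wordcount_obj(u'hello world')
    # plain Latin text is just split on whitespace
    print wc.words                                        # 2

    wc = get_wordcount_obj(u'日本語AアジアンB')
    # every character above U+3000 counts as one word (7 here), and the
    # remaining Latin runs 'A' and 'B' add two more, so words == 7 + 2
    print wc.asian_chars, wc.non_asian_words, wc.words    # 7 2 9

PreProcessor feeds it text with the <head> block and all remaining tags
stripped first, so markup never inflates the estimate used to size the
chapter search.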
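
And a standalone sketch of what the two substitutions added to MobiReader do
to the illegal nesting the subject line describes; the sample markup is
hypothetical:

    import re

    html = '<p align="left"><blockquote>quoted text</blockquote></p>'
    # move the opening <blockquote>/<div> outside the opening <p> ...
    html = re.sub(r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})',
        '\g<blockquote>'+'\g<para>', html)
    # ... and the closing </p> inside the closing </blockquote>/</div>
    html = re.sub(r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)',
        '\g<para>'+'\g<blockquote>', html)
    print html
    # <blockquote><p align="left">quoted text</p></blockquote>

The swapped order is the one lxml will keep: a <p> nested inside a
<blockquote> survives parsing, while a <blockquote> inside a <p> implicitly
closes the paragraph.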