From 15e6c1d212ebe9c4f1a935914ff9dccf2f93e5da Mon Sep 17 00:00:00 2001
From: ldolse
Date: Tue, 23 Nov 2010 13:54:45 +0800
Subject: [PATCH] blockquotes and paragraph tags also have a conflict in mobi
 files - lxml allows <p> inside blockquote, but not the other way around

---
 src/calibre/ebooks/conversion/utils.py | 28 ++++++---
 src/calibre/ebooks/mobi/reader.py      |  3 +
 src/calibre/utils/wordcount.py         | 83 ++++++++++++++++++++++++++
 3 files changed, 105 insertions(+), 9 deletions(-)
 create mode 100644 src/calibre/utils/wordcount.py

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 11979b933c..8baeefcd1a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
 import re
 from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
+from calibre.utils.wordcount import get_wordcount_obj
 
 class PreProcessor(object):
 
@@ -107,7 +108,7 @@ class PreProcessor(object):
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
         html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
-        
+
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
@@ -168,9 +169,21 @@ class PreProcessor(object):
             #print "blanks between paragraphs is marked True"
         else:
             blanks_between_paragraphs = False
+
+        # Count the words in the document to estimate how many chapters to look for
+        word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+        word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
+        wordcount = get_wordcount_obj(word_count_text)
+
+
         #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
         #
+        min_chapters = 10
+        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+        self.html_preprocess_sections = len(heading.findall(html))
+        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+
         # Build the Regular Expressions in pieces
         init_lookahead = "(?=<(p|div))"
         chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
@@ -192,12 +205,7 @@ class PreProcessor(object):
         n_lookahead_close = ")"
 
         default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
-        
-        min_chapters = 10
-        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
-        self.html_preprocess_sections = len(heading.findall(html))
-        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
-        
+
         chapter_types = [
             [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
             [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
@@ -219,9 +227,11 @@ class PreProcessor(object):
         else:
             chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
         chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
-        
+
         html = chapdetect.sub(self.chapter_head, html)
-        
+
+        words_per_chptr = wordcount.words / self.html_preprocess_sections
+        print "wordcount is: "+ str(wordcount.words)+", Average words per chapter is: "+str(words_per_chptr)+", Marked "+str(self.html_preprocess_sections)+" chapters"
 
         ###### Unwrap lines ######
         #
diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py
index c4845f9443..309023ede9 100644
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -480,6 +480,9 @@ class MobiReader(object):
         # - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
         self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)
         self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', '\g<styletags>'+'\g<para>', self.processed_html)
+        self.processed_html = re.sub(r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', '\g<para>'+'\g<blockquote>', self.processed_html)
+        self.processed_html = re.sub(r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', '\g<blockquote>'+'\g<para>', self.processed_html)
+
     def remove_random_bytes(self, html):
         return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08',
diff --git a/src/calibre/utils/wordcount.py b/src/calibre/utils/wordcount.py
new file mode 100644
index 0000000000..2bc91f4014
--- /dev/null
+++ b/src/calibre/utils/wordcount.py
@@ -0,0 +1,83 @@
+#!/usr/bin/python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+"""
+Get word, character, and Asian character counts
+
+1. Get a word count as a dictionary:
+    wc = get_wordcount(text)
+    words = wc['words'] # etc.
+
+2. Get a word count as an object
+    wc = get_wordcount_obj(text)
+    words = wc.words # etc.
+
+properties counted:
+    * characters
+    * chars_no_spaces
+    * asian_chars
+    * non_asian_words
+    * words
+
+Python License
+"""
+__version__ = 0.1
+__author__ = "Ryan Ginstrom"
+
+IDEOGRAPHIC_SPACE = 0x3000
+
+def is_asian(char):
+    """Is the character Asian?"""
+
+    # 0x3000 is ideographic space (i.e. double-byte space)
+    # Anything over is an Asian character
+    return ord(char) > IDEOGRAPHIC_SPACE
+
+def filter_jchars(c):
+    """Filters Asian characters to spaces"""
+    if is_asian(c):
+        return ' '
+    return c
+
+def nonj_len(word):
+    u"""Returns number of non-Asian words in {word}
+    - 日本語AアジアンB -> 2
+    - hello -> 1
+    @param word: A word, possibly containing Asian characters
+    """
+    # Here are the steps:
+    # 本spam日eggs
+    # -> [' ', 's', 'p', 'a', 'm', ' ', 'e', 'g', 'g', 's']
+    # -> ' spam eggs'
+    # -> ['spam', 'eggs']
+    # The length of which is 2!
+    chars = [filter_jchars(c) for c in word]
+    return len(u''.join(chars).split())
+
+def get_wordcount(text):
+    """Get the word/character count for text
+
+    @param text: The text of the segment
+    """
+
+    characters = len(text)
+    chars_no_spaces = sum([not x.isspace() for x in text])
+    asian_chars = sum([is_asian(x) for x in text])
+    non_asian_words = nonj_len(text)
+    words = non_asian_words + asian_chars
+
+    return dict(characters=characters,
+                chars_no_spaces=chars_no_spaces,
+                asian_chars=asian_chars,
+                non_asian_words=non_asian_words,
+                words=words)
+
+def dict2obj(dictionary):
+    """Transform a dictionary into an object"""
+    class Obj(object):
+        def __init__(self, dictionary):
+            self.__dict__.update(dictionary)
+    return Obj(dictionary)
+
+def get_wordcount_obj(text):
+    """Get the wordcount as an object rather than a dictionary"""
+    return dict2obj(get_wordcount(text))
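
A minimal usage sketch of the new calibre.utils.wordcount helper, assuming the
patch is applied; the sample strings are invented for illustration and are not
part of the patch:

    # -*- coding: utf-8 -*-
    from calibre.utils.wordcount import get_wordcount_obj

    wc = get_wordcount_obj(u'hello world')
    # plain Latin text is just split on whitespace
    print wc.words                                        # 2

    wc = get_wordcount_obj(u'日本語AアジアンB')
    # every character above U+3000 counts as one word (7 here), and the
    # remaining Latin runs 'A' and 'B' add two more, so words == 7 + 2
    print wc.asian_chars, wc.non_asian_words, wc.words    # 7 2 9

PreProcessor feeds it text with the <head> block and all remaining tags
stripped first, so markup never inflates the estimate used to size the
chapter search.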
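
And a standalone sketch of what the two substitutions added to MobiReader do
to the illegal nesting the subject line describes; the sample markup is
hypothetical:

    import re

    html = '<p align="left"><blockquote>quoted text</blockquote></p>'
    # move the opening <blockquote>/<div> outside the opening <p> ...
    html = re.sub(r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})',
        '\g<blockquote>'+'\g<para>', html)
    # ... and the closing </p> inside the closing </blockquote>/</div>
    html = re.sub(r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)',
        '\g<para>'+'\g<blockquote>', html)
    print html
    # <blockquote><p align="left">quoted text</p></blockquote>

The swapped order is the one lxml will keep: a <p> nested inside a
<blockquote> survives parsing, while a <blockquote> inside a <p> implicitly
closes the paragraph.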