From 15e6c1d212ebe9c4f1a935914ff9dccf2f93e5da Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 23 Nov 2010 13:54:45 +0800
Subject: [PATCH 01/34] blockquotes and paragraph tags also have a conflict in
 mobi files - lxml allows <p> inside blockquote, but not the other way around

---
 src/calibre/ebooks/conversion/utils.py | 28 ++++++---
 src/calibre/ebooks/mobi/reader.py      |  3 +
 src/calibre/utils/wordcount.py         | 83 ++++++++++++++++++++++++++
 3 files changed, 105 insertions(+), 9 deletions(-)
 create mode 100644 src/calibre/utils/wordcount.py
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 11979b933c..8baeefcd1a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
 import re
 from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
+from calibre.utils.wordcount import get_wordcount_obj
 
 class PreProcessor(object):
 
@@ -107,7 +108,7 @@ class PreProcessor(object):
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
         html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
-
+        
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
@@ -168,9 +169,21 @@ class PreProcessor(object):
                #print "blanks between paragraphs is marked True"
             else:
                 blanks_between_paragraphs = False
+                
+        # Count the words in the document to estimate how many chapters to look for
+        word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+        word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
+        wordcount = get_wordcount_obj(word_count_text)
+        
+        
         #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
         #
+        min_chapters = 10
+        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+        self.html_preprocess_sections = len(heading.findall(html))
+        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+
         # Build the Regular Expressions in pieces
         init_lookahead = "(?=<(p|div))"
         chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
@@ -192,12 +205,7 @@ class PreProcessor(object):
         n_lookahead_close = ")"
 
         default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
-
-        min_chapters = 10
-        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
-        self.html_preprocess_sections = len(heading.findall(html))
-        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
-
+        
         chapter_types = [
             [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
             [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
@@ -219,9 +227,11 @@ class PreProcessor(object):
             else:
                 chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
                 chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
-
+                
             html = chapdetect.sub(self.chapter_head, html)
-
+        
+        words_per_chptr = wordcount.words / self.html_preprocess_sections
+        print "wordcount is: "+ str(wordcount.words)+", Average words per chapter is: "+str(words_per_chptr)+", Marked "+str(self.html_preprocess_sections)+" chapters"
 
         ###### Unwrap lines ######
         #
diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py
index c4845f9443..309023ede9 100644
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -480,6 +480,9 @@ class MobiReader(object):
         # - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
         self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)
         self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', '\g<styletags>'+'\g<para>', self.processed_html)
+        self.processed_html = re.sub(r'(?i)(?P<blockquote>(</blockquote[^>]*>\s*){1,})(?P<para></p[^>]*>)', '\g<para>'+'\g<blockquote>', self.processed_html)
+        self.processed_html = re.sub(r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<blockquote[^>]*>\s*){1,})', '\g<blockquote>'+'\g<para>', self.processed_html)
+
 
     def remove_random_bytes(self, html):
         return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08',
diff --git a/src/calibre/utils/wordcount.py b/src/calibre/utils/wordcount.py
new file mode 100644
index 0000000000..2bc91f4014
--- /dev/null
+++ b/src/calibre/utils/wordcount.py
@@ -0,0 +1,83 @@
+#!/usr/bin/python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+"""
+Get word, character, and Asian character counts
+
+1. Get a word count as a dictionary:
+    wc = get_wordcount(text)
+    words = wc['words'] # etc.
+
+2. Get a word count as an object
+    wc = get_wordcount_obj(text)
+    words = wc.words # etc.
+
+properties counted:
+    * characters
+    * chars_no_spaces
+    * asian_chars
+    * non_asian_words
+    * words
+    
+Python License
+"""
+__version__ = 0.1
+__author__ = "Ryan Ginstrom"
+
+IDEOGRAPHIC_SPACE = 0x3000
+
+def is_asian(char):
+    """Is the character Asian?"""
+
+    # 0x3000 is ideographic space (i.e. double-byte space)
+    # Anything over is an Asian character
+    return ord(char) > IDEOGRAPHIC_SPACE
+
+def filter_jchars(c):
+    """Filters Asian characters to spaces"""
+    if is_asian(c):
+        return ' '
+    return c
+
+def nonj_len(word):
+    u"""Returns number of non-Asian words in {word}
+    - 日本語AアジアンB -> 2
+    - hello -> 1
+    @param word: A word, possibly containing Asian characters
+    """
+    # Here are the steps:
+    # 本spam日eggs
+    # -> [' ', 's', 'p', 'a', 'm', ' ', 'e', 'g', 'g', 's']
+    # -> ' spam eggs'
+    # -> ['spam', 'eggs']
+    # The length of which is 2!
+    chars = [filter_jchars(c) for c in word]
+    return len(u''.join(chars).split())
+
+def get_wordcount(text):
+    """Get the word/character count for text
+
+    @param text: The text of the segment
+    """
+
+    characters = len(text)
+    chars_no_spaces = sum([not x.isspace() for x in text])
+    asian_chars =  sum([is_asian(x) for x in text])
+    non_asian_words = nonj_len(text)
+    words = non_asian_words + asian_chars
+    
+    return dict(characters=characters,
+                chars_no_spaces=chars_no_spaces,
+                asian_chars=asian_chars,
+                non_asian_words=non_asian_words,
+                words=words)
+
+def dict2obj(dictionary):
+    """Transform a dictionary into an object"""
+    class Obj(object):
+        def __init__(self, dictionary):
+            self.__dict__.update(dictionary)
+    return Obj(dictionary)
+
+def get_wordcount_obj(text):
+    """Get the wordcount as an object rather than a dictionary"""
+    return dict2obj(get_wordcount(text))

From a1dcbb33c1c9ff12c8dbc2092ef4172c014dc827 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 29 Nov 2010 16:38:17 +0800
Subject: [PATCH 02/34] moved chapter markup to a function, tied preprocessing
 into word count

---
 src/calibre/ebooks/conversion/utils.py | 149 +++++++++++++++----------
 1 file changed, 87 insertions(+), 62 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 8baeefcd1a..c42068cfe0 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -65,7 +65,7 @@ class PreProcessor(object):
         inspect.  Percent is the minimum percent of line endings which should
         be marked up to return true.
         '''
-        htm_end_ere = re.compile('</p>', re.DOTALL)
+        htm_end_ere = re.compile('</(p|div)>', re.DOTALL)
         line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
         htm_end = htm_end_ere.findall(raw)
         line_end = line_end_ere.findall(raw)
@@ -102,12 +102,93 @@ class PreProcessor(object):
                 with open(os.path.join(odir, name), 'wb') as f:
                     f.write(raw.encode('utf-8'))
 
+    def get_word_count(self, html):
+        totalwords = 0
+        word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+        word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
+        wordcount = get_wordcount_obj(word_count_text)
+        return wordcount.words
+
+    def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+        # Typical chapters are between 2000 and 7000 words, use the larger number to decide the 
+        # minimum of chapters to search for
+        self.min_chapters = 1
+        if wordcount > 7000:
+            self.min_chapters = wordcount / 7000
+        print "minimum chapters required are: "+str(self.min_chapters)
+        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+        self.html_preprocess_sections = len(heading.findall(html))
+        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+
+        # Build the Regular Expressions in pieces
+        init_lookahead = "(?=<(p|div))"
+        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
+        chapter_header_open = r"(?P<chap>"
+        title_header_open = r"(?P<title>"
+        chapter_header_close = ")\s*"
+        title_header_close = ")"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
+
+        if blanks_between_paragraphs:
+            blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
+        else:
+            blank_lines = ""
+        opt_title_open = "("
+        opt_title_close = ")?"
+        n_lookahead_open = "\s+(?!"
+        n_lookahead_close = ")"
+
+        default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
+        
+        chapter_types = [
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+            [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+            ]
+
+        # Start with most typical chapter headings, get more aggressive until one works
+        for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+            if self.html_preprocess_sections >= self.min_chapters:
+                break
+            full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+            self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+            if lookahead_ignorecase:
+                chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+                chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+            else:
+                chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+                
+            html = chapdetect.sub(self.chapter_head, html)
+
+        words_per_chptr = wordcount
+        if words_per_chptr > 0 and self.html_preprocess_sections > 0:
+            words_per_chptr = wordcount / self.html_preprocess_sections
+        print "Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters"            
+
+        return html
+
+
+
     def __call__(self, html):
         self.log("*********  Preprocessing HTML  *********")
 
+        # Count the words in the document to estimate how many chapters to look for and whether
+        # other types of processing are attempted
+        totalwords = self.get_word_count(html)
+        
+        if totalwords < 10:
+            print "not enough text, not preprocessing"
+            return html
+
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-        html = re.sub(r"\s*</p>", "</p>\n", html)
-        html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
+        html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
+        html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
         
         ###### Check Markup ######
         #
@@ -170,68 +251,12 @@ class PreProcessor(object):
             else:
                 blanks_between_paragraphs = False
                 
-        # Count the words in the document to estimate how many chapters to look for
-        word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
-        word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
-        wordcount = get_wordcount_obj(word_count_text)
-        
-        
         #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
         #
-        min_chapters = 10
-        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
-        self.html_preprocess_sections = len(heading.findall(html))
-        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
 
-        # Build the Regular Expressions in pieces
-        init_lookahead = "(?=<(p|div))"
-        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
-        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
-        chapter_header_open = r"(?P<chap>"
-        title_header_open = r"(?P<title>"
-        chapter_header_close = ")\s*"
-        title_header_close = ")"
-        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
-        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
+        self.markup_chapters(html, totalwords, blanks_between_paragraphs)
 
-        if blanks_between_paragraphs:
-            blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
-        else:
-            blank_lines = ""
-        opt_title_open = "("
-        opt_title_close = ")?"
-        n_lookahead_open = "\s+(?!"
-        n_lookahead_close = ")"
-
-        default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
-        
-        chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
-            [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
-            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
-            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
-            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
-            ]
-
-        # Start with most typical chapter headings, get more aggressive until one works
-        for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
-            if self.html_preprocess_sections >= min_chapters:
-                break
-            full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
-            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
-            if lookahead_ignorecase:
-                chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            else:
-                chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
-                
-            html = chapdetect.sub(self.chapter_head, html)
-        
-        words_per_chptr = wordcount.words / self.html_preprocess_sections
-        print "wordcount is: "+ str(wordcount.words)+", Average words per chapter is: "+str(words_per_chptr)+", Marked "+str(self.html_preprocess_sections)+" chapters"
 
         ###### Unwrap lines ######
         #
@@ -257,7 +282,7 @@ class PreProcessor(object):
         # Calculate Length
         unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
         length = docanalysis.line_length(unwrap_factor)
-        self.log("*** Median line length is " + unicode(length) + ", calculated with " + format + " format ***")
+        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
         # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
         if hardbreaks or unwrap_factor < 0.4:
             self.log("Unwrapping required, unwrapping Lines")
@@ -286,7 +311,7 @@ class PreProcessor(object):
         html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
-        if self.html_preprocess_sections < 5:
+        if self.html_preprocess_sections < self.min_chapters:
             self.log("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)

From e9f5cb683db7425c8fdf6c01523d69e085f221e4 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 10 Dec 2010 13:29:36 -0800
Subject: [PATCH 03/34] tweaked chapter markup function

---
 src/calibre/ebooks/conversion/utils.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index c42068cfe0..0665cccb14 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -103,7 +103,6 @@ class PreProcessor(object):
                     f.write(raw.encode('utf-8'))
 
     def get_word_count(self, html):
-        totalwords = 0
         word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
         word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
         wordcount = get_wordcount_obj(word_count_text)
@@ -162,15 +161,13 @@ class PreProcessor(object):
                 chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             else:
                 chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
-                
+                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)               
             html = chapdetect.sub(self.chapter_head, html)
 
         words_per_chptr = wordcount
         if words_per_chptr > 0 and self.html_preprocess_sections > 0:
             words_per_chptr = wordcount / self.html_preprocess_sections
         print "Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters"            
-
         return html
 
 
@@ -180,10 +177,11 @@ class PreProcessor(object):
 
         # Count the words in the document to estimate how many chapters to look for and whether
         # other types of processing are attempted
+        totalwords = 0
         totalwords = self.get_word_count(html)
         
-        if totalwords < 10:
-            print "not enough text, not preprocessing"
+        if totalwords < 20:
+            self.log("not enough text, not preprocessing")
             return html
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
@@ -255,7 +253,7 @@ class PreProcessor(object):
         # detect chapters/sections to match xpath or splitting logic
         #
 
-        self.markup_chapters(html, totalwords, blanks_between_paragraphs)
+        html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
 
 
         ###### Unwrap lines ######

From 2aa0a8d38aae8e9f8ee312954698808782b4c884 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 11 Dec 2010 15:08:28 -0500
Subject: [PATCH 04/34] handle br tags to render in ADE

---
 src/calibre/ebooks/conversion/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 0665cccb14..cda9d9cbba 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -227,6 +227,8 @@ class PreProcessor(object):
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        # ADE doesn't render <br />, change to empty paragraphs
+        html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
 
         # If more than 40% of the lines are empty paragraphs and the user has enabled remove
         # paragraph spacing then delete blank lines to clean up spacing

From 13dbd42f35d3b5bc36ee3fa46a8ec2cab19e1d71 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 13 Dec 2010 16:19:50 -0500
Subject: [PATCH 05/34] Added ellipsis substitutions to the smarten punctuation
 option

---
 src/calibre/ebooks/conversion/preprocess.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 3ff816b3bf..bc4df4233a 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -554,5 +554,8 @@ class HTMLPreProcessor(object):
         html = smartyPants(html)
         html = html.replace(start, '<!--')
         html = html.replace(stop, '-->')
+        # convert ellipsis to entities to prevent unwrapping
+        html = re.sub('(?u)(?<=\w)(\.\s?){3}', '&hellip;', html)
+        html = re.sub('(?u)(?<=\w)\s(\.\s?){3}', '&nbsp;&hellip;', html)
         return substitute_entites(html)
 

From 10b3353f57cea9f3bc18d29b367acedd05832162 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 14 Dec 2010 12:16:32 -0500
Subject: [PATCH 06/34] tweaked the ellipsis substitution

---
 src/calibre/ebooks/conversion/preprocess.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index bc4df4233a..3385771228 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -554,8 +554,8 @@ class HTMLPreProcessor(object):
         html = smartyPants(html)
         html = html.replace(start, '<!--')
         html = html.replace(stop, '-->')
-        # convert ellipsis to entities to prevent unwrapping
-        html = re.sub('(?u)(?<=\w)(\.\s?){3}', '&hellip;', html)
-        html = re.sub('(?u)(?<=\w)\s(\.\s?){3}', '&nbsp;&hellip;', html)
+        # convert ellipsis to entities to prevent wrapping
+        html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
+        # nbsp gets changed to space: html = re.sub('(?u)(?<=\w)\s(\.\s?){2}\.', '&nbsp;&hellip;', html)
         return substitute_entites(html)
 

From 0c2ab9e32838933e0b3731f8cca72a0e98c36730 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 17 Dec 2010 02:09:25 -0500
Subject: [PATCH 07/34] merged pdf chapter markup with preprocess markup

---
 src/calibre/ebooks/conversion/preprocess.py | 27 +++++++++++++++------
 src/calibre/ebooks/conversion/utils.py      | 18 ++++++++++++--
 2 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 3385771228..310a636022 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -364,12 +364,15 @@ class HTMLPreProcessor(object):
                   (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
 
                   # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
+                  #(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
                   # Cover the case where every letter in a chapter title is separated by a space
-                  (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
+                  #(re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
 
-                  # Have paragraphs show better
-                  (re.compile(r'<br.*?>'), lambda match : '<p>'),
+                  # Convert line breaks to paragraphs
+                  (re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'),
+                  (re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'),
+                  (re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'),
+                  
                   # Clean up spaces
                   (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
                   # Add space before and after italics
@@ -455,9 +458,9 @@ class HTMLPreProcessor(object):
         # delete soft hyphens - moved here so it's executed after header/footer removal
         if is_pdftohtml:
             # unwrap/delete soft hyphens
-            end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
+            end_rules.append((re.compile(u'[­](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens with formatting
-            end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
+            end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
 
         # Make the more aggressive chapter marking regex optional with the preprocess option to
         # reduce false positives and move after header/footer removal
@@ -475,7 +478,7 @@ class HTMLPreProcessor(object):
                 end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                 )
 
         for rule in self.PREPROCESS + start_rules:
@@ -508,7 +511,15 @@ class HTMLPreProcessor(object):
         if is_pdftohtml and length > -1:
             # Dehyphenate
             dehyphenator = Dehyphenator()
-            html = dehyphenator(html,'pdf', length)
+            html = dehyphenator(html,'html', length)
+
+        if is_pdftohtml:
+            from calibre.ebooks.conversion.utils import PreProcessor
+            pdf_markup = PreProcessor(self.extra_opts, None)
+            totalwords = 0
+            totalwords = pdf_markup.get_word_count(html)
+            if totalwords > 7000:
+                html = pdf_markup.markup_chapters(html, totalwords, True)
 
         #dump(html, 'post-preprocess')
 
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index cda9d9cbba..3fd7f88434 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -18,6 +18,9 @@ class PreProcessor(object):
         self.found_indents = 0
         self.extra_opts = extra_opts
 
+    def is_pdftohtml(self, src):
+        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]  # True only when the pdftohtml marker comment occurs within the first 1000 characters of src
+
     def chapter_head(self, match):
         chap = match.group('chap')
         title = match.group('title')
@@ -130,6 +133,15 @@ class PreProcessor(object):
         chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
         title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
 
+        is_pdftohtml = self.is_pdftohtml(html)
+        if is_pdftohtml:
+            print "this is a pdf"
+        chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
+        chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
+        title_line_open = "<(?P<outer2>p)[^>]*>\s*"
+        title_line_close = "\s*</(?P=outer2)>"
+
+
         if blanks_between_paragraphs:
             blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
         else:
@@ -139,11 +151,13 @@ class PreProcessor(object):
         n_lookahead_open = "\s+(?!"
         n_lookahead_close = ")"
 
-        default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
         
         chapter_types = [
             [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
-            [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
+            [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"],  # Spaced Lettering
+            [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"],  # Numeric Chapters, no dot or colon
             [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
             [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
             [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters

From 24da52303d96e3417d5f347e9d0248abb9af6970 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Thu, 23 Dec 2010 13:52:13 -0500
Subject: [PATCH 08/34] added more non-ascii lower-case characters to the
 unwrap expressions

---
 src/calibre/ebooks/conversion/preprocess.py | 2 +-
 src/calibre/ebooks/conversion/utils.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 310a636022..ca74b04e8d 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -478,7 +478,7 @@ class HTMLPreProcessor(object):
                 end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                 )
 
         for rule in self.PREPROCESS + start_rules:
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 3fd7f88434..2176f0811a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -309,7 +309,7 @@ class PreProcessor(object):
             self.log("Done dehyphenating")
             # Unwrap lines using punctation and line length
             #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
-            unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+            unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
             html = unwrap.sub(' ', html)
             #check any remaining hyphens, but only unwrap if there is a match
             dehyphenator = Dehyphenator()

From 7008e2a23a8697c6418fe56501f631fbc3e1c63d Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 28 Dec 2010 10:39:45 -0500
Subject: [PATCH 09/34] fixed some indents

---
 src/calibre/ebooks/conversion/utils.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 2176f0811a..56c9c9673e 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -135,11 +135,10 @@ class PreProcessor(object):
 
         is_pdftohtml = self.is_pdftohtml(html)
         if is_pdftohtml:
-            print "this is a pdf"
-        chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
-        chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
-        title_line_open = "<(?P<outer2>p)[^>]*>\s*"
-        title_line_close = "\s*</(?P=outer2)>"
+            chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
+            chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
+            title_line_open = "<(?P<outer2>p)[^>]*>\s*"
+            title_line_close = "\s*</(?P=outer2)>"
 
 
         if blanks_between_paragraphs:

From a2e47dae8fc92312af7481a4546ca6fee698f7ad Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 3 Jan 2011 01:11:34 -0500
Subject: [PATCH 10/34] reordered chapter priority, added em-dashes to smarten
 punctuation

---
 src/calibre/ebooks/conversion/preprocess.py | 5 +++--
 src/calibre/ebooks/conversion/utils.py      | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index ca74b04e8d..7f27d7a465 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -353,7 +353,7 @@ class HTMLPreProcessor(object):
                   (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
 
                   # Center separator lines
-                  (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
+                  (re.compile(u'<br>\s*(?P<break>([*#•✦]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
 
                   # Remove page links
                   (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
@@ -567,6 +567,7 @@ class HTMLPreProcessor(object):
         html = html.replace(stop, '-->')
         # convert ellipsis to entities to prevent wrapping
         html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
-        # nbsp gets changed to space: html = re.sub('(?u)(?<=\w)\s(\.\s?){2}\.', '&nbsp;&hellip;', html)
+        # convert double dashes to em-dash
+        html = re.sub('\s--\s', u'\u2014', html)
         return substitute_entites(html)
 
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 56c9c9673e..51f81978cf 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -156,9 +156,9 @@ class PreProcessor(object):
             [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
             [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
             [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"],  # Spaced Lettering
-            [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"],  # Numeric Chapters, no dot or colon
             [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
             [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
+            [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"],  # Numeric Chapters, no dot or colon
             [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
             ]
 

From 760d4d2fd35b2dc4284c2798a184a89b241438b6 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 7 Jan 2011 01:48:23 +0800
Subject: [PATCH 11/34] added preface to the list of common chapter headings

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 51f81978cf..ec83d36cfc 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -153,7 +153,7 @@ class PreProcessor(object):
         default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
         
         chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
             [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
             [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"],  # Spaced Lettering
             [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines

From 93bd1df11adc6fb33ed518fe898696f99e7ed3d1 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 7 Jan 2011 01:57:00 +0800
Subject: [PATCH 12/34] cleaned up comments

---
 src/calibre/ebooks/conversion/preprocess.py | 5 -----
 src/calibre/utils/wordcount.py              | 4 +++-
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 7f27d7a465..67be59083e 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -363,11 +363,6 @@ class HTMLPreProcessor(object):
                   # Remove gray background
                   (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
 
-                  # Detect Chapters to match default XPATH in GUI
-                  #(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
-                  # Cover the case where every letter in a chapter title is separated by a space
-                  #(re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
-
                   # Convert line breaks to paragraphs
                   (re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'),
                   (re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'),
diff --git a/src/calibre/utils/wordcount.py b/src/calibre/utils/wordcount.py
index 2bc91f4014..b317f99469 100644
--- a/src/calibre/utils/wordcount.py
+++ b/src/calibre/utils/wordcount.py
@@ -18,7 +18,9 @@ properties counted:
     * non_asian_words
     * words
     
-Python License
+Sourced from: 
+http://ginstrom.com/scribbles/2008/05/17/counting-words-etc-in-an-html-file-with-python/
+http://ginstrom.com/scribbles/2007/10/06/counting-words-characters-and-asian-characters-with-python/
 """
 __version__ = 0.1
 __author__ = "Ryan Ginstrom"

From 482c15e16ec8b7ce373c6048684b7565548fb62e Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 7 Jan 2011 02:34:52 +0800
Subject: [PATCH 13/34] removed debug statements

---
 src/calibre/ebooks/conversion/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index ec83d36cfc..2e1ee5852e 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -117,7 +117,7 @@ class PreProcessor(object):
         self.min_chapters = 1
         if wordcount > 7000:
             self.min_chapters = wordcount / 7000
-        print "minimum chapters required are: "+str(self.min_chapters)
+        #print "minimum chapters required are: "+str(self.min_chapters)
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
@@ -180,7 +180,7 @@ class PreProcessor(object):
         words_per_chptr = wordcount
         if words_per_chptr > 0 and self.html_preprocess_sections > 0:
             words_per_chptr = wordcount / self.html_preprocess_sections
-        print "Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters"            
+        self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")        
         return html
 
 

From 439b8c0f213d3b27888086b67619198c0722705f Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 7 Jan 2011 03:40:47 +0800
Subject: [PATCH 14/34] delete microsoft smart tags during preprocess

---
 src/calibre/ebooks/conversion/utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 6d2d123b10..4bb96ac088 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -234,8 +234,11 @@ class PreProcessor(object):
             self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
         # remove remaining non-breaking spaces
         html = re.sub(ur'\u00a0', ' ', html)
+        # Get rid of various common microsoft specific tags which can cause issues later
         # Get rid of empty <o:p> tags to simplify other processing
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
+        # Delete microsoft 'smart' tags
+        html = re.sub('(?i)</?st1:\w+>', '', html)
         # Get rid of empty span, bold, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)

From 9af7ba996f40dc4979df720f9d5bdcf36a8c14da Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 6 Jan 2011 14:40:01 -0700
Subject: [PATCH 15/34] If the user specifies an unknown encoding,
 automatically change it to None

---
 src/calibre/gui2/convert/__init__.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/calibre/gui2/convert/__init__.py b/src/calibre/gui2/convert/__init__.py
index e5f72099fe..ea7a24510a 100644
--- a/src/calibre/gui2/convert/__init__.py
+++ b/src/calibre/gui2/convert/__init__.py
@@ -6,7 +6,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import textwrap
+import textwrap, codecs
 from functools import partial
 
 from PyQt4.Qt import QWidget, QSpinBox, QDoubleSpinBox, QLineEdit, QTextEdit, \
@@ -128,6 +128,7 @@ class Widget(QWidget):
     def get_value(self, g):
         from calibre.gui2.convert.xpath_wizard import XPathEdit
         from calibre.gui2.convert.regex_builder import RegexEdit
+        from calibre.gui2.widgets import EncodingComboBox
         ret = self.get_value_handler(g)
         if ret != 'this is a dummy return value, xcswx1avcx4x':
             return ret
@@ -139,6 +140,13 @@ class Widget(QWidget):
             if not ans:
                 ans = None
             return ans
+        elif isinstance(g, EncodingComboBox):
+            ans = unicode(g.currentText()).strip()
+            try:
+                codecs.lookup(ans)
+            except:
+                ans = ''
+            return ans
         elif isinstance(g, QComboBox):
             return unicode(g.currentText())
         elif isinstance(g, QCheckBox):

From df602343b4caf21493f56997b1068d03b306ca84 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 6 Jan 2011 17:33:17 -0700
Subject: [PATCH 16/34] Walla by marbs

---
 resources/recipes/njp.recipe   |  2 +-
 resources/recipes/walla.recipe | 44 ++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 resources/recipes/walla.recipe

diff --git a/resources/recipes/njp.recipe b/resources/recipes/njp.recipe
index ed202512f2..996aef2fdf 100644
--- a/resources/recipes/njp.recipe
+++ b/resources/recipes/njp.recipe
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 __license__     = 'GPL v3'
-__copyright__   = 'Chema Cort�s - 2011-01-05'
+__copyright__   = u'Chema Cort\xe9s - 2011-01-05'
 __version__     = 'v0.01'
 __date__        = '2011-01-05'
 '''
diff --git a/resources/recipes/walla.recipe b/resources/recipes/walla.recipe
new file mode 100644
index 0000000000..5fbfed7a03
--- /dev/null
+++ b/resources/recipes/walla.recipe
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1283848012(BasicNewsRecipe):
+    """Recipe for the Hebrew-language Walla RSS feeds; article URLs are rewritten to their '/@@/item/printer' variant."""
+    description   = 'The WallaNews.'
+    cover_url      = 'http://ftp5.bizportal.co.il/web/giflib/news/rsPhoto/sz_5/rsz_220_220_logo_walla.gif'
+    title          = u'Walla'
+    language              = 'he'
+    __author__ = 'marbs'
+    extra_css='img {max-width:100%;} body{direction: rtl;},title{direction: rtl; } ,article_description{direction: rtl; }, a.article{direction: rtl; } ,calibre_feed_description{direction: rtl; }'
+    simultaneous_downloads = 5
+#    remove_javascript     = True
+    timefmt        = '[%a, %d %b, %Y]'
+    oldest_article = 1
+    max_articles_per_feed = 100
+ #   remove_attributes = ['width']
+    keep_only_tags =dict(name='div', attrs={'class':'wp-0-b w3'})
+    remove_tags = [dict(name='div', attrs={'class':'tagsContainer'})]
+#    preprocess_regexps = [
+#        (re.compile(r'<p>&nbsp;</p>', re.DOTALL|re.IGNORECASE), lambda match: '')
+#        ]
+
+
+    feeds          = [(u'חדשות', u'http://rss.walla.co.il/?w=/1/0/1/@rss'),
+                           (u'עסקים', u'http://rss.walla.co.il/?w=/2/3/1/@rss'),
+                           (u'תרבות', u'http://rss.walla.co.il/?w=/4/249/1/@rss'),
+                           (u'בריאות', u'http://rss.walla.co.il/?w=/5/18/1/@rss'),
+                           (u'TECH', u'http://rss.walla.co.il/?w=/6/4/1/@rss'),
+                           (u'אסטרולוגיה', u'http://rss.walla.co.il/?w=/8/3307/1/@rss'),
+                           (u'בעלי חיים', u'http://rss.walla.co.il/?w=/59/5703/1/@rss'),
+                           (u'רכב', u'http://rss.walla.co.il/?w=/31/4700/1/@rss'),
+                           (u'סלבס', u'http://rss.walla.co.il/?w=/22/3600/1/@rss'),
+                           (u'אוכל', u'http://rss.walla.co.il/?w=/9/903/1/@rss'),
+                           (u'אופנה', u'http://rss.walla.co.il/?w=/24/2120/1/@rss'),
+                           (u'ברנזה', u'http://rss.walla.co.il/?w=/27/3900/1/@rss'),
+                           (u'ZONE', u'http://rss.walla.co.il/?w=/18/500/1/@rss'),
+                           (u'ספורט', u'http://rss.walla.co.il/?w=/3/7/1/@rss')]
+
+    def print_version(self, url):
+        print_url = url + '/@@/item/printer'
+        return print_url
+

From 332c80aa54f73aa25fb31a5e2dd0482560bff384 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 6 Jan 2011 17:35:00 -0700
Subject: [PATCH 17/34] New London Day by Being

---
 resources/recipes/new_london_day.recipe | 74 +++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 resources/recipes/new_london_day.recipe

diff --git a/resources/recipes/new_london_day.recipe b/resources/recipes/new_london_day.recipe
new file mode 100644
index 0000000000..bc8c44e40e
--- /dev/null
+++ b/resources/recipes/new_london_day.recipe
@@ -0,0 +1,74 @@
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1294342201(BasicNewsRecipe):
+    """Recipe for the theday.com RSS feeds (state, local and business news from New London, CT)."""
+    title          = u'New London Day'
+    __author__  = 'Being'
+    description = 'State, local and business news from New London, CT'
+    language = 'en_GB'  # NOTE(review): the description says New London, CT; confirm en_GB is intended
+    oldest_article = 1
+    max_articles_per_feed = 200
+
+    use_embedded_content    = False
+    no_stylesheets        = True
+    remove_javascript = True
+    remove_tags_before = dict(id='article')
+    # Page clean-up filters applied to every downloaded article.
+    remove_tags_after = [    {'class':['photo_article',]} ]
+    remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer"]},
+                   {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent"]},
+                   dict(name='font',attrs={'id':["cr-other-headlines"]})]
+    extra_css = '''
+                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
+                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+                    .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
+                    .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
+                    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+                    .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
+                    .story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+                    .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+                    .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+                    .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+                    .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+                    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
+		'''
+
+    feeds = [
+            (u'All News', u'http://www.theday.com/section/rss'),
+            (u'Breaking News', u'http://www.theday.com/section/rss01'),
+            (u'Police and Courts', u'http://www.theday.com/section/rss02'),
+            (u'State News', u'http://www.theday.com/section/rss03'),
+            (u'Local Business', u'http://www.theday.com/section/rss04'),
+            (u'Entertainment', u'http://www.theday.com/section/rss05'),
+            (u'Opinion', u'http://www.theday.com/section/rss06'),
+            (u'Casinos', u'http://www.theday.com/section/rss12'),
+            (u'Defense and Military', u'http://www.theday.com/section/rss14'),
+            (u'Ann Baldelli Ruminations', u'http://www.theday.com/section/rss20'),
+            (u'Paul Choiniere Ruminations', u'http://www.theday.com/section/rss21'),
+            (u'Michael Costanza Omnivore', u'http://www.theday.com/section/rss23'),
+            (u'Rebecca Dangelo Reel Life', u'http://www.theday.com/section/rss25'),]
+
+    def print_version(self, url):
+        return url.replace('/index.html', '/print.html')
+
+    def get_article_url(self, article):
+        return article.get('feedburner_origlink', article.get('guid', article.get('link')))
+
+
+    def postprocess_html(self, soup, first_fetch):
+        # Turn table, row and cell tags into plain divs.
+        for t in soup.findAll(['table', 'tr', 'td']):
+            t.name = 'div'
+
+        # Remove the comments form and the "cr-other-headlines" font block from the tree.
+        for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})):
+            tag.extract()
+        for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})):
+            tag.extract()
+
+        return soup
+

From 7343d48a37227ed8e9093e2ccf4aead31aac614d Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Thu, 6 Jan 2011 20:04:11 -0500
Subject: [PATCH 18/34] Remove PDB and TCR input options. TXT auto options are
 default and should suffice.

---
 src/calibre/ebooks/fb2/fb2ml.py       |  2 +-
 src/calibre/ebooks/pdb/input.py       | 24 ------------------------
 src/calibre/ebooks/tcr/input.py       | 24 ------------------------
 src/calibre/gui2/convert/pdb_input.py | 25 -------------------------
 src/calibre/gui2/convert/tcr_input.py | 25 -------------------------
 5 files changed, 1 insertion(+), 99 deletions(-)
 delete mode 100644 src/calibre/gui2/convert/pdb_input.py
 delete mode 100644 src/calibre/gui2/convert/tcr_input.py

diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index f6deab677a..4dd6e7c7ae 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -173,7 +173,7 @@ class FB2MLizer(object):
             if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml':
                 self.oeb_book.spine.insert(0, title_item, True)
         # Create xhtml page to reference cover image so it can be used.
-        if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
+        if not title_name and self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
             id = unicode(self.oeb_book.metadata.cover[0])
             cover_item = self.oeb_book.manifest.ids[id]
             if cover_item.media_type in OEB_RASTER_IMAGES:
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index 3688abff3f..8c754782a2 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -18,30 +18,6 @@ class PDBInput(InputFormatPlugin):
     description = 'Convert PDB to HTML'
     file_types  = set(['pdb'])
 
-    options = set([
-        OptionRecommendation(name='paragraph_type', recommended_value='auto',
-            choices=['auto', 'block', 'single', 'print'],
-            help=_('Paragraph structure.\n'
-                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
-                   '* auto: Try to auto detect paragraph type.\n'
-                   '* block: Treat a blank line as a paragraph break.\n'
-                   '* single: Assume every line is a paragraph.\n'
-                   '* print:  Assume every line starting with 2+ spaces or a tab '
-                   'starts a paragraph.')),
-        OptionRecommendation(name='formatting_type', recommended_value='auto',
-            choices=['auto', 'none', 'markdown'],
-            help=_('Formatting used within the document.'
-                   '* auto: Try to auto detect the document formatting.\n'
-                   '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
-                   '* markdown: Run the input though the markdown pre-processor. '
-                   'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
-        OptionRecommendation(name='preserve_spaces', recommended_value=False,
-            help=_('Normally extra spaces are condensed into a single space. '
-                'With this option all spaces will be displayed.')),
-        OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
-            help=_('Do not insert a Table of Contents into the output text.')),
-    ])
-
     def convert(self, stream, options, file_ext, log,
                 accelerators):
         header = PdbHeaderReader(stream)
diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py
index c1dcef235d..4c759c5be2 100644
--- a/src/calibre/ebooks/tcr/input.py
+++ b/src/calibre/ebooks/tcr/input.py
@@ -16,30 +16,6 @@ class TCRInput(InputFormatPlugin):
     description = 'Convert TCR files to HTML'
     file_types  = set(['tcr'])
 
-    options = set([
-        OptionRecommendation(name='paragraph_type', recommended_value='auto',
-            choices=['auto', 'block', 'single', 'print'],
-            help=_('Paragraph structure.\n'
-                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
-                   '* auto: Try to auto detect paragraph type.\n'
-                   '* block: Treat a blank line as a paragraph break.\n'
-                   '* single: Assume every line is a paragraph.\n'
-                   '* print:  Assume every line starting with 2+ spaces or a tab '
-                   'starts a paragraph.')),
-        OptionRecommendation(name='formatting_type', recommended_value='auto',
-            choices=['auto', 'none', 'markdown'],
-            help=_('Formatting used within the document.'
-                   '* auto: Try to auto detect the document formatting.\n'
-                   '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
-                   '* markdown: Run the input though the markdown pre-processor. '
-                   'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
-        OptionRecommendation(name='preserve_spaces', recommended_value=False,
-            help=_('Normally extra spaces are condensed into a single space. '
-                'With this option all spaces will be displayed.')),
-        OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
-            help=_('Do not insert a Table of Contents into the output text.')),
-    ])
-
     def convert(self, stream, options, file_ext, log, accelerators):
         log.info('Decompressing text...')
         raw_txt = decompress(stream)
diff --git a/src/calibre/gui2/convert/pdb_input.py b/src/calibre/gui2/convert/pdb_input.py
deleted file mode 100644
index 16ff1ff236..0000000000
--- a/src/calibre/gui2/convert/pdb_input.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# -*- coding: utf-8 -*-
-
-__license__ = 'GPL 3'
-__copyright__ = '2011, John Schember <john@nachtimwald.com>'
-__docformat__ = 'restructuredtext en'
-
-from calibre.gui2.convert.txt_input_ui import Ui_Form
-from calibre.gui2.convert import Widget
-
-class PluginWidget(Widget, Ui_Form):
-
-    TITLE = _('PDB Input')
-    HELP = _('Options specific to')+' PDB '+_('input')
-    COMMIT_NAME = 'pdb_input'
-    ICON = I('mimetypes/txt.png')
-
-    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
-        Widget.__init__(self, parent,
-            ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
-        self.db, self.book_id = db, book_id
-        for x in get_option('paragraph_type').option.choices:
-            self.opt_paragraph_type.addItem(x)
-        for x in get_option('formatting_type').option.choices:
-            self.opt_formatting_type.addItem(x)
-        self.initialize_options(get_option, get_help, db, book_id)
diff --git a/src/calibre/gui2/convert/tcr_input.py b/src/calibre/gui2/convert/tcr_input.py
deleted file mode 100644
index 366643ad5b..0000000000
--- a/src/calibre/gui2/convert/tcr_input.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# -*- coding: utf-8 -*-
-
-__license__ = 'GPL 3'
-__copyright__ = '2011, John Schember <john@nachtimwald.com>'
-__docformat__ = 'restructuredtext en'
-
-from calibre.gui2.convert.txt_input_ui import Ui_Form
-from calibre.gui2.convert import Widget
-
-class PluginWidget(Widget, Ui_Form):
-
-    TITLE = _('TCR Input')
-    HELP = _('Options specific to')+' TCR '+_('input')
-    COMMIT_NAME = 'tcr_input'
-    ICON = I('mimetypes/txt.png')
-
-    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
-        Widget.__init__(self, parent,
-            ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
-        self.db, self.book_id = db, book_id
-        for x in get_option('paragraph_type').option.choices:
-            self.opt_paragraph_type.addItem(x)
-        for x in get_option('formatting_type').option.choices:
-            self.opt_formatting_type.addItem(x)
-        self.initialize_options(get_option, get_help, db, book_id)

From 1786820728f1d69d2f5c5bf2ffd4d8f50f4b0219 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Thu, 6 Jan 2011 20:07:09 -0500
Subject: [PATCH 19/34] PDB PDF Input: Dynamically set options based on PDF
 plugin.

---
 src/calibre/ebooks/pdb/pdf/reader.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py
index c151551866..30b0c4c57c 100644
--- a/src/calibre/ebooks/pdb/pdf/reader.py
+++ b/src/calibre/ebooks/pdb/pdf/reader.py
@@ -19,9 +19,6 @@ class Reader(FormatReader):
         self.stream = stream
         self.log = log
         self.options = options
-        setattr(self.options, 'new_pdf_engine', False)
-        setattr(self.options, 'no_images', False)
-        setattr(self.options, 'unwrap_factor', 0.45)
 
     def extract_content(self, output_dir):
         self.log.info('Extracting PDF...')
@@ -31,7 +28,12 @@ class Reader(FormatReader):
             for x in xrange(self.header.section_count()):
                 pdf.write(self.header.section_data(x))
 
-            from calibre.customize.ui import plugin_for_input_format
-            pdf.seek(0)
-            return plugin_for_input_format('pdf').convert(pdf, self.options,
-                'pdf', self.log, [])
+        from calibre.customize.ui import plugin_for_input_format
+
+        pdf_plugin = plugin_for_input_format('pdf')
+        for option in pdf_plugin.options:
+            if not hasattr(self.options, option.option.name):
+                setattr(self.options, option.option.name, option.recommended_value)
+
+        pdf.seek(0)
+        return pdf_plugin.convert(pdf, self.options, 'pdf', self.log, {})

From b5599f8ff2a9006d4312a9c88451afaf6001e41d Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Thu, 6 Jan 2011 20:51:28 -0500
Subject: [PATCH 20/34] Fix indents.

---
 src/calibre/ebooks/conversion/preprocess.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 3ff816b3bf..9a27274dd8 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -51,16 +51,16 @@ def chap_head(match):
     chap = match.group('chap')
     title = match.group('title')
     if not title:
-               return '<h1>'+chap+'</h1><br/>\n'
+        return '<h1>'+chap+'</h1><br/>\n'
     else:
-               return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n'
+        return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n'
 
 def wrap_lines(match):
     ital = match.group('ital')
     if not ital:
-               return ' '
+        return ' '
     else:
-               return ital+' '
+        return ital+' '
 
 class DocAnalysis(object):
     '''
@@ -191,7 +191,7 @@ class Dehyphenator(object):
         dehyphenated = unicode(firsthalf) + unicode(secondhalf)
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
-           lookupword = self.removeprefix.sub('', lookupword)
+            lookupword = self.removeprefix.sub('', lookupword)
         #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
         try:
             searchresult = self.html.find(lookupword.lower())

From c7332d3651a54bf3d9a5890f08af0c6de6776acb Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Thu, 6 Jan 2011 20:57:03 -0500
Subject: [PATCH 21/34] Fix indents.

---
 src/calibre/ebooks/conversion/utils.py | 40 +++++++++++++-------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 11979b933c..a76ec8675d 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -113,24 +113,24 @@ class PreProcessor(object):
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
         # <pre> tags), check and  mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
-             self.log("not enough paragraph markers, adding now")
-             # check if content is in pre tags, use txt processor to mark up if so
-             pre = re.compile(r'<pre>', re.IGNORECASE)
-             if len(pre.findall(html)) == 1:
-                 self.log("Running Text Processing")
-                 from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
-                 separate_paragraphs_single_line
-                 outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
-                 html = outerhtml.sub('\g<text>', html)
-                 html = separate_paragraphs_single_line(html)
-                 html = preserve_spaces(html)
-                 html = convert_basic(html, epub_split_size_kb=0)
-             else:
-                 # Add markup naively
-                 # TODO - find out if there are cases where there are more than one <pre> tag or
-                 # other types of unmarked html and handle them in some better fashion
-                 add_markup = re.compile('(?<!>)(\n)')
-                 html = add_markup.sub('</p>\n<p>', html)
+            self.log("not enough paragraph markers, adding now")
+            # check if content is in pre tags, use txt processor to mark up if so
+            pre = re.compile(r'<pre>', re.IGNORECASE)
+            if len(pre.findall(html)) == 1:
+                self.log("Running Text Processing")
+                from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+                separate_paragraphs_single_line
+                outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
+                html = outerhtml.sub('\g<text>', html)
+                html = separate_paragraphs_single_line(html)
+                html = preserve_spaces(html)
+                html = convert_basic(html, epub_split_size_kb=0)
+            else:
+                # Add markup naively
+                # TODO - find out if there are cases where there are more than one <pre> tag or
+                # other types of unmarked html and handle them in some better fashion
+                add_markup = re.compile('(?<!>)(\n)')
+                html = add_markup.sub('</p>\n<p>', html)
 
         ###### Mark Indents/Cleanup ######
         #
@@ -164,8 +164,8 @@ class PreProcessor(object):
                 self.log("deleting blank lines")
                 html = blankreg.sub('', html)
             elif float(len(blanklines)) / float(len(lines)) > 0.40:
-               blanks_between_paragraphs = True
-               #print "blanks between paragraphs is marked True"
+                blanks_between_paragraphs = True
+                #print "blanks between paragraphs is marked True"
             else:
                 blanks_between_paragraphs = False
         #self.dump(html, 'before_chapter_markup')

From 6a407327118744fe93aef1b0cd45e4368ff6f017 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 6 Jan 2011 20:34:23 -0700
Subject: [PATCH 22/34] ...

---
 src/calibre/gui2/book_details.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/calibre/gui2/book_details.py b/src/calibre/gui2/book_details.py
index 8e3e8b10de..2f7892692c 100644
--- a/src/calibre/gui2/book_details.py
+++ b/src/calibre/gui2/book_details.py
@@ -256,8 +256,10 @@ class BookInfo(QWebView):
                     % (left_pane, right_pane)))
 
     def mouseDoubleClickEvent(self, ev):
-        if self.width() - ev.x() < 25 or \
-            self.height() - ev.y() < 25:
+        swidth = self.page().mainFrame().scrollBarGeometry(Qt.Vertical).width()
+        sheight = self.page().mainFrame().scrollBarGeometry(Qt.Horizontal).height()
+        if self.width() - ev.x() < swidth or \
+            self.height() - ev.y() < sheight:
             # Filter out double clicks on the scroll bar
             ev.accept()
         else:

From 32c1ef8ef6a964d5e42528953c35d413c5c0d9c2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 6 Jan 2011 20:41:38 -0700
Subject: [PATCH 23/34] E-book viewer: Fix next page skipping the bottom of
 chapters when the content is wider than the window. Fixes #8153 (Viewer in
 0.7.36 cutting off end of chapters.)

---
 src/calibre/gui2/viewer/documentview.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/calibre/gui2/viewer/documentview.py b/src/calibre/gui2/viewer/documentview.py
index f77f23c154..13469f5622 100644
--- a/src/calibre/gui2/viewer/documentview.py
+++ b/src/calibre/gui2/viewer/documentview.py
@@ -449,7 +449,7 @@ class Document(QWebPage): # {{{
         return self.mainFrame().contentsSize().width() # offsetWidth gives inaccurate results
 
     def set_bottom_padding(self, amount):
-        s = QSize(-1, -1) if amount == 0 else QSize(self.width,
+        s = QSize(-1, -1) if amount == 0 else QSize(self.viewportSize().width(),
                 self.height+amount)
         self.setPreferredContentsSize(s)
 
@@ -820,6 +820,7 @@ class DocumentView(QWebView): # {{{
                         self.flipper.initialize(self.current_page_image())
                     self.manager.next_document()
                 return
+            #oheight = self.document.height
             lower_limit = opos + delta_y # Max value of top y co-ord after scrolling
             max_y = self.document.height - window_height # The maximum possible top y co-ord
             if max_y < lower_limit:
@@ -835,6 +836,7 @@ class DocumentView(QWebView): # {{{
             if epf:
                 self.flipper.initialize(self.current_page_image())
             #print 'Document height:', self.document.height
+            #print 'Height change:', (self.document.height - oheight)
             max_y = self.document.height - window_height
             lower_limit = min(max_y, lower_limit)
             #print 'Scroll to:', lower_limit
@@ -842,6 +844,7 @@ class DocumentView(QWebView): # {{{
                 self.document.scroll_to(self.document.xpos, lower_limit)
             actually_scrolled = self.document.ypos - opos
             #print 'After scroll pos:', self.document.ypos
+            #print 'Scrolled by:', self.document.ypos - opos
             self.find_next_blank_line(window_height - actually_scrolled)
             #print 'After blank line pos:', self.document.ypos
             if epf:

From dcb425ebbf81b068b1e4c679c060ee205d967d71 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 6 Jan 2011 20:56:32 -0700
Subject: [PATCH 24/34] E-book viewer: When scrolling to a bookmark and the
 content is wider than the window, do not scroll in the horizontal direction

---
 resources/viewer/bookmarks.js | 1 +
 1 file changed, 1 insertion(+)

diff --git a/resources/viewer/bookmarks.js b/resources/viewer/bookmarks.js
index d36e7c579a..467e4d9d38 100644
--- a/resources/viewer/bookmarks.js
+++ b/resources/viewer/bookmarks.js
@@ -41,6 +41,7 @@ function scroll_to_bookmark(bookmark) {
     $.scrollTo($(bm[0]), 1000,
         {
             over:ratio,
+            axis: 'y',
             onAfter:function(){window.py_bridge.animated_scroll_done()}
         }
     );

From cff2e9b34793c11e4c2e677848f2a719e95dcfc9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 6 Jan 2011 20:57:05 -0700
Subject: [PATCH 25/34] ...

---
 resources/viewer/bookmarks.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/viewer/bookmarks.js b/resources/viewer/bookmarks.js
index 467e4d9d38..253524326f 100644
--- a/resources/viewer/bookmarks.js
+++ b/resources/viewer/bookmarks.js
@@ -41,7 +41,7 @@ function scroll_to_bookmark(bookmark) {
     $.scrollTo($(bm[0]), 1000,
         {
             over:ratio,
-            axis: 'y',
+            axis: 'y', // Do not scroll in the x direction
             onAfter:function(){window.py_bridge.animated_scroll_done()}
         }
     );

From 9ee7c1f0d29a97f208c22193155b04affd1559c0 Mon Sep 17 00:00:00 2001
From: Translators <>
Date: Fri, 7 Jan 2011 04:59:33 +0000
Subject: [PATCH 26/34] Launchpad automatic translations update.

---
 src/calibre/translations/cs.po    | 18 ++++----
 src/calibre/translations/de.po    | 10 ++---
 src/calibre/translations/gl.po    | 29 ++++++++-----
 src/calibre/translations/it.po    |  8 ++--
 src/calibre/translations/nds.po   |  6 +--
 src/calibre/translations/pt_BR.po | 72 ++++++++++++++++++++-----------
 6 files changed, 87 insertions(+), 56 deletions(-)

diff --git a/src/calibre/translations/cs.po b/src/calibre/translations/cs.po
index 77231346a8..3d4de14c39 100644
--- a/src/calibre/translations/cs.po
+++ b/src/calibre/translations/cs.po
@@ -8,13 +8,13 @@ msgstr ""
 "Project-Id-Version: calibre\n"
 "Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
 "POT-Creation-Date: 2011-01-02 23:55+0000\n"
-"PO-Revision-Date: 2011-01-04 08:51+0000\n"
-"Last-Translator: TomVal <Unknown>\n"
+"PO-Revision-Date: 2011-01-06 11:10+0000\n"
+"Last-Translator: schunka <Unknown>\n"
 "Language-Team: Czech <cs@li.org>\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"X-Launchpad-Export-Date: 2011-01-05 04:43+0000\n"
+"X-Launchpad-Export-Date: 2011-01-07 04:57+0000\n"
 "X-Generator: Launchpad (build Unknown)\n"
 
 #: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43
@@ -799,7 +799,7 @@ msgstr "Spojit se s Sanda Bambook eBook čtečkou"
 
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:25
 msgid "Li Fanxi"
-msgstr ""
+msgstr "Li Fanxi"
 
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:41
 msgid "Device IP Address (restart calibre after changing)"
@@ -1126,11 +1126,11 @@ msgstr "Komunikovat se zařízením Trekstor"
 
 #: /home/kovid/work/calibre/src/calibre/devices/misc.py:251
 msgid "Communicate with the EEE Reader"
-msgstr ""
+msgstr "Probíhá spojení se čtečkou EEE Reader."
 
 #: /home/kovid/work/calibre/src/calibre/devices/misc.py:271
 msgid "Communicate with the Nextbook Reader"
-msgstr ""
+msgstr "Probíhá spojení se čtečkou Nextbook Reader."
 
 #: /home/kovid/work/calibre/src/calibre/devices/nokia/driver.py:17
 msgid "Communicate with the Nokia 770 internet tablet."
@@ -1174,11 +1174,11 @@ msgstr "Spojit se se Sony eBook reader"
 
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/driver.py:61
 msgid "All by title"
-msgstr ""
+msgstr "Vše podle názvu"
 
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/driver.py:62
 msgid "All by author"
-msgstr ""
+msgstr "Vše podle autora"
 
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/driver.py:64
 msgid ""
@@ -1226,7 +1226,7 @@ msgstr "Spojit se se Sovos reader."
 
 #: /home/kovid/work/calibre/src/calibre/devices/teclast/driver.py:78
 msgid "Communicate with the Sunstech EB700 reader."
-msgstr ""
+msgstr "Probíhá spojení se čtečkou Sunstech EB700."
 
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/device.py:258
 msgid "Unable to detect the %s disk drive. Try rebooting."
diff --git a/src/calibre/translations/de.po b/src/calibre/translations/de.po
index 6418ab3d7d..a330704198 100644
--- a/src/calibre/translations/de.po
+++ b/src/calibre/translations/de.po
@@ -8,13 +8,13 @@ msgstr ""
 "Project-Id-Version: de\n"
 "Report-Msgid-Bugs-To: \n"
 "POT-Creation-Date: 2011-01-02 23:55+0000\n"
-"PO-Revision-Date: 2011-01-01 21:21+0000\n"
-"Last-Translator: Kovid Goyal <Unknown>\n"
+"PO-Revision-Date: 2011-01-07 02:17+0000\n"
+"Last-Translator: heinz beck <Unknown>\n"
 "Language-Team: American English <kde-i18n-doc@lists.kde.org>\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"X-Launchpad-Export-Date: 2011-01-04 13:52+0000\n"
+"X-Launchpad-Export-Date: 2011-01-07 04:58+0000\n"
 "X-Generator: Launchpad (build Unknown)\n"
 "Generated-By: pygettext.py 1.5\n"
 
@@ -943,7 +943,7 @@ msgstr "Kommunikation mit dem PocketBook 301 Reader."
 
 #: /home/kovid/work/calibre/src/calibre/devices/eb600/driver.py:233
 msgid "Communicate with the PocketBook 602/603/902/903 reader."
-msgstr ""
+msgstr "Kommunikation mit dem PocketBook 602/603/902/903 Reader."
 
 #: /home/kovid/work/calibre/src/calibre/devices/eb600/driver.py:252
 msgid "Communicate with the PocketBook 701"
@@ -1186,7 +1186,7 @@ msgstr "Kommunikation mit allen Sony eBook Readern."
 
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/driver.py:61
 msgid "All by title"
-msgstr ""
+msgstr "Alle nach Titel"
 
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/driver.py:62
 msgid "All by author"
diff --git a/src/calibre/translations/gl.po b/src/calibre/translations/gl.po
index 33708ef88c..597487b7dc 100644
--- a/src/calibre/translations/gl.po
+++ b/src/calibre/translations/gl.po
@@ -8,13 +8,13 @@ msgstr ""
 "Project-Id-Version: calibre\n"
 "Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
 "POT-Creation-Date: 2011-01-02 23:55+0000\n"
-"PO-Revision-Date: 2011-01-02 13:21+0000\n"
-"Last-Translator: Calidonia Hibernia <Unknown>\n"
+"PO-Revision-Date: 2011-01-06 14:46+0000\n"
+"Last-Translator: Antón Méixome <meixome@gmail.com>\n"
 "Language-Team: dev@gl.openoffice.org\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"X-Launchpad-Export-Date: 2011-01-04 13:52+0000\n"
+"X-Launchpad-Export-Date: 2011-01-07 04:58+0000\n"
 "X-Generator: Launchpad (build Unknown)\n"
 "Language: gl\n"
 
@@ -5749,7 +5749,7 @@ msgstr "Tamaño da mensaxe para a descrición das miniaturas de portada"
 
 #: /home/kovid/work/calibre/src/calibre/gui2/catalog/catalog_epub_mobi_ui.py:330
 msgid " inch"
-msgstr ""
+msgstr " polgada"
 
 #: /home/kovid/work/calibre/src/calibre/gui2/catalog/catalog_epub_mobi_ui.py:331
 msgid "&Description note"
@@ -10645,15 +10645,15 @@ msgstr "Nunca"
 
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/look_feel.py:60
 msgid "By first letter"
-msgstr ""
+msgstr "Pola primeira letra"
 
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/look_feel.py:60
 msgid "Disabled"
-msgstr ""
+msgstr "Desactivado"
 
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/look_feel.py:61
 msgid "Partitioned"
-msgstr ""
+msgstr "Particionado"
 
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/look_feel_ui.py:140
 msgid "User Interface &layout (needs restart):"
@@ -10709,7 +10709,7 @@ msgstr "Buscar mentres se escribe"
 
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/look_feel_ui.py:152
 msgid "Tags browser category partitioning method:"
-msgstr ""
+msgstr "Método de particionado con categorías de etiquetas de navegación:"
 
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/look_feel_ui.py:153
 msgid ""
@@ -10719,10 +10719,15 @@ msgid ""
 "have a list of fixed-sized groups. Set to disabled\n"
 "if you never want subcategories"
 msgstr ""
+"Escoller como as subcategorías de etiquetas de navegación se amosan cando\n"
+"hai máis ítems que os do límite. Seleccione por primeira\n"
+"letra para ver unha lista A, B, C. Escolla particionado para\n"
+"ter unha lista de grupos de tamaño fixo. Escolla desactivado\n"
+"se non vai querer nunca subcategorías"
 
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/look_feel_ui.py:158
 msgid "Collapse when more items than:"
-msgstr ""
+msgstr "Colapsar cando os ítems son máis de:"
 
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/look_feel_ui.py:159
 msgid ""
@@ -10730,6 +10735,10 @@ msgid ""
 "up into sub-categories. If the partition method is set to disable, this "
 "value is ignored."
 msgstr ""
+"Se unha categoría de etiquetas de navegación ten máis ca este número de "
+"ítems, divídese\n"
+"en subcategorías. Se o método de partición se pon como desactivado, "
+"ignorarase este valor."
 
 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/look_feel_ui.py:161
 msgid "&Toolbar"
@@ -11494,7 +11503,7 @@ msgstr "Mostrar todas as categorías"
 
 #: /home/kovid/work/calibre/src/calibre/gui2/tag_view.py:300
 msgid "Change sub-categorization scheme"
-msgstr ""
+msgstr "Cambiar o esquema de subcategorización"
 
 #: /home/kovid/work/calibre/src/calibre/gui2/tag_view.py:625
 msgid ""
diff --git a/src/calibre/translations/it.po b/src/calibre/translations/it.po
index 73a13b051e..1be0988afd 100644
--- a/src/calibre/translations/it.po
+++ b/src/calibre/translations/it.po
@@ -9,13 +9,13 @@ msgstr ""
 "Project-Id-Version: calibre_calibre-it\n"
 "Report-Msgid-Bugs-To: \n"
 "POT-Creation-Date: 2011-01-02 23:55+0000\n"
-"PO-Revision-Date: 2011-01-02 22:45+0000\n"
-"Last-Translator: Marco Ciampa <ciampix@libero.it>\n"
+"PO-Revision-Date: 2011-01-06 15:33+0000\n"
+"Last-Translator: Francesco Pasa <Unknown>\n"
 "Language-Team: italiano\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"X-Launchpad-Export-Date: 2011-01-04 13:53+0000\n"
+"X-Launchpad-Export-Date: 2011-01-07 04:58+0000\n"
 "X-Generator: Launchpad (build Unknown)\n"
 "X-Poedit-Bookmarks: -1,-1,-1,-1,-1,1105,-1,1312,-1,-1\n"
 "Generated-By: pygettext.py 1.5\n"
@@ -5694,7 +5694,7 @@ msgstr ""
 
 #: /home/kovid/work/calibre/src/calibre/gui2/catalog/catalog_epub_mobi_ui.py:330
 msgid " inch"
-msgstr ""
+msgstr " pollice"
 
 #: /home/kovid/work/calibre/src/calibre/gui2/catalog/catalog_epub_mobi_ui.py:331
 msgid "&Description note"
diff --git a/src/calibre/translations/nds.po b/src/calibre/translations/nds.po
index e4d1ad8f1a..80d6f376c3 100644
--- a/src/calibre/translations/nds.po
+++ b/src/calibre/translations/nds.po
@@ -8,13 +8,13 @@ msgstr ""
 "Project-Id-Version: nds\n"
 "Report-Msgid-Bugs-To: \n"
 "POT-Creation-Date: 2011-01-02 23:55+0000\n"
-"PO-Revision-Date: 2010-10-18 00:57+0000\n"
-"Last-Translator: Nils-Christoph Fiedler <ncfiedler@gnome.org>\n"
+"PO-Revision-Date: 2011-01-07 02:48+0000\n"
+"Last-Translator: heinz beck <Unknown>\n"
 "Language-Team: German\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"X-Launchpad-Export-Date: 2011-01-04 13:55+0000\n"
+"X-Launchpad-Export-Date: 2011-01-07 04:59+0000\n"
 "X-Generator: Launchpad (build Unknown)\n"
 "X-Poedit-Country: GERMANY\n"
 "X-Poedit-Language: German\n"
diff --git a/src/calibre/translations/pt_BR.po b/src/calibre/translations/pt_BR.po
index af6071797c..26d16546e6 100644
--- a/src/calibre/translations/pt_BR.po
+++ b/src/calibre/translations/pt_BR.po
@@ -8,13 +8,13 @@ msgstr ""
 "Project-Id-Version: calibre\n"
 "Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
 "POT-Creation-Date: 2011-01-02 23:55+0000\n"
-"PO-Revision-Date: 2010-12-18 05:47+0000\n"
-"Last-Translator: Kovid Goyal <Unknown>\n"
+"PO-Revision-Date: 2011-01-06 13:01+0000\n"
+"Last-Translator: MoroniGranja <Unknown>\n"
 "Language-Team: American English <kde-i18n-doc@kde.org>\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"X-Launchpad-Export-Date: 2011-01-04 14:00+0000\n"
+"X-Launchpad-Export-Date: 2011-01-07 04:59+0000\n"
 "X-Generator: Launchpad (build Unknown)\n"
 
 #: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43
@@ -172,7 +172,7 @@ msgstr "Leitor de metadados"
 
 #: /home/kovid/work/calibre/src/calibre/customize/__init__.py:266
 msgid "Metadata writer"
-msgstr ""
+msgstr "Escritor de metadados"
 
 #: /home/kovid/work/calibre/src/calibre/customize/__init__.py:296
 msgid "Catalog generator"
@@ -589,6 +589,8 @@ msgid ""
 "Intended for the Samsung Galaxy and similar tablet devices with a resolution "
 "of 600x1280"
 msgstr ""
+"Planejado para o Samsung Galaxy e tablets similares com uma resolução "
+"de 600x1280"
 
 #: /home/kovid/work/calibre/src/calibre/customize/profiles.py:471
 msgid "This profile is intended for the Kobo Reader."
@@ -695,7 +697,7 @@ msgstr "Desabilitar a extensão com nome"
 
 #: /home/kovid/work/calibre/src/calibre/debug.py:148
 msgid "Debug log"
-msgstr ""
+msgstr "Log de Debug"
 
 #: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:13
 msgid "Communicate with Android phones."
@@ -808,7 +810,7 @@ msgstr "Comunicar com iTunes."
 
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:24
 msgid "Communicate with the Sanda Bambook eBook reader."
-msgstr ""
+msgstr "Comunicar com o leitor de eBooks Sanda Bambook"
 
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:25
 msgid "Li Fanxi"
@@ -817,17 +819,22 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:41
 msgid "Device IP Address (restart calibre after changing)"
 msgstr ""
+"Endereço IP do dispositivo (é necessário reiniciar calibre após modificar)"
 
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:46
 msgid ""
 "Unable to add book to library directly from Bambook. Please save the book to "
 "disk and add the file to library from disk."
 msgstr ""
+"Impossível adicionar livro à biblioteca diretamente do Bambook. Favor salvar "
+"o livro no disco e adicionar o arquivo do disco à biblioteca."
 
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:66
 msgid ""
 "Unable to connect to Bambook, you need to install Bambook library first."
 msgstr ""
+"Não foi possível conectar ao Bambook, é necessário instalar a biblioteca "
+"Bambook."
 
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:74
 msgid ""
@@ -835,10 +842,13 @@ msgid ""
 "If you are trying to connect via Wi-Fi, please make sure the IP address of "
 "Bambook has been correctly configured."
 msgstr ""
+"Não foi possível conectar ao Bambook. \n"
+"Se você está tentando conectar por Wi-Fi, favor confirmar se o endereço IP "
+"do Bambook foi configurado corretamente."
 
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:111
 msgid "Bambook"
-msgstr ""
+msgstr "Bambook"
 
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:217
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:233
@@ -899,7 +909,7 @@ msgstr "Enviando metadados ao dispositivo..."
 
 #: /home/kovid/work/calibre/src/calibre/devices/bambook/libbambookcore.py:132
 msgid "Bambook SDK has not been installed."
-msgstr ""
+msgstr "Bambook SDK não foi instalado."
 
 #: /home/kovid/work/calibre/src/calibre/devices/binatone/driver.py:17
 msgid "Communicate with the Binatone Readme eBook reader."
@@ -938,11 +948,11 @@ msgstr "Comunica-se com o leitor PocketBook 301"
 
 #: /home/kovid/work/calibre/src/calibre/devices/eb600/driver.py:233
 msgid "Communicate with the PocketBook 602/603/902/903 reader."
-msgstr ""
+msgstr "Comunicar-se com o PocketBook 602/603/902/903 reader."
 
 #: /home/kovid/work/calibre/src/calibre/devices/eb600/driver.py:252
 msgid "Communicate with the PocketBook 701"
-msgstr ""
+msgstr "Comunicar-se com o PocketBook 701"
 
 #: /home/kovid/work/calibre/src/calibre/devices/edge/driver.py:17
 msgid "Entourage Edge"
@@ -1069,6 +1079,8 @@ msgid ""
 "The Kobo supports only one collection currently: the \"Im_Reading\" list.  "
 "Create a tag called \"Im_Reading\" "
 msgstr ""
+"O Kobo aceita apenas uma coleção atualmente: a lista \"Im_Reading\". Crie "
+"uma tag chamada \"Im_Reading\" "
 
 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:446
 #: /home/kovid/work/calibre/src/calibre/gui2/actions/add.py:279
@@ -1097,7 +1109,7 @@ msgstr "Comunicar com o Sweex MM300"
 
 #: /home/kovid/work/calibre/src/calibre/devices/misc.py:79
 msgid "Communicate with the Digma Q600"
-msgstr ""
+msgstr "Comunicar-se com o Digma Q600"
 
 #: /home/kovid/work/calibre/src/calibre/devices/misc.py:88
 msgid "Communicate with the Kogan"
@@ -1110,7 +1122,7 @@ msgstr "Comunicar com o Pandigital Novel"
 
 #: /home/kovid/work/calibre/src/calibre/devices/misc.py:142
 msgid "Communicate with the VelocityMicro"
-msgstr ""
+msgstr "Comunicar-se com o VelocityMicro"
 
 #: /home/kovid/work/calibre/src/calibre/devices/misc.py:160
 msgid "Communicate with the GM2000"
@@ -1118,23 +1130,23 @@ msgstr "Comunicar com o GM2000"
 
 #: /home/kovid/work/calibre/src/calibre/devices/misc.py:180
 msgid "Communicate with the Acer Lumiread"
-msgstr ""
+msgstr "Comunicar-se com o Acer Lumiread"
 
 #: /home/kovid/work/calibre/src/calibre/devices/misc.py:211
 msgid "Communicate with the Aluratek Color"
-msgstr ""
+msgstr "Comunicar-se com o Aluratek Color"
 
 #: /home/kovid/work/calibre/src/calibre/devices/misc.py:231
 msgid "Communicate with the Trekstor"
-msgstr ""
+msgstr "Comunicar-se com o Trekstor"
 
 #: /home/kovid/work/calibre/src/calibre/devices/misc.py:251
 msgid "Communicate with the EEE Reader"
-msgstr ""
+msgstr "Comunicar-se com o EEE Reader"
 
 #: /home/kovid/work/calibre/src/calibre/devices/misc.py:271
 msgid "Communicate with the Nextbook Reader"
-msgstr ""
+msgstr "Comunicar-se com o Nextbook Reader"
 
 #: /home/kovid/work/calibre/src/calibre/devices/nokia/driver.py:17
 msgid "Communicate with the Nokia 770 internet tablet."
@@ -1142,7 +1154,7 @@ msgstr "Comunica-se com o Nokia 770 Internet Tablet."
 
 #: /home/kovid/work/calibre/src/calibre/devices/nokia/driver.py:40
 msgid "Communicate with the Nokia 810/900 internet tablet."
-msgstr ""
+msgstr "Comunicar-se com o internet tablet Nokia 810/900."
 
 #: /home/kovid/work/calibre/src/calibre/devices/nokia/driver.py:74
 msgid "Communicate with the Nokia E52"
@@ -1158,11 +1170,11 @@ msgstr "Comunica-se com o leitor Nook."
 
 #: /home/kovid/work/calibre/src/calibre/devices/nook/driver.py:85
 msgid "Nook Color"
-msgstr ""
+msgstr "Nook Color"
 
 #: /home/kovid/work/calibre/src/calibre/devices/nook/driver.py:86
 msgid "Communicate with the Nook Color eBook reader."
-msgstr ""
+msgstr "Comunicar-se com o Nook Color."
 
 #: /home/kovid/work/calibre/src/calibre/devices/nuut2/driver.py:17
 msgid "Communicate with the Nuut2 eBook reader."
@@ -1178,11 +1190,11 @@ msgstr "Comunica-se com todos os leitores da Sony."
 
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/driver.py:61
 msgid "All by title"
-msgstr ""
+msgstr "Todos por título"
 
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/driver.py:62
 msgid "All by author"
-msgstr ""
+msgstr "Todos por autor"
 
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/driver.py:64
 msgid ""
@@ -1198,6 +1210,9 @@ msgid ""
 "to the list to enable them. The collections will be given the name provided "
 "after the \":\" character."
 msgstr ""
+". Duas coleções especiais estão disponíveis: %s:%s e %s:%s. Adicione estes "
+"valores à lista para habilita-los. As coleções receberão o nome após os dois "
+"pontos (\":\")."
 
 #: /home/kovid/work/calibre/src/calibre/devices/prs505/sony_cache.py:190
 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/transforms/structure.py:68
@@ -1218,7 +1233,7 @@ msgstr "Comunica-se com o leitor Newsmy."
 
 #: /home/kovid/work/calibre/src/calibre/devices/teclast/driver.py:47
 msgid "Communicate with the Pico reader."
-msgstr ""
+msgstr "Comunicar-se com o Pico reader."
 
 #: /home/kovid/work/calibre/src/calibre/devices/teclast/driver.py:57
 msgid "Communicate with the iPapyrus reader."
@@ -1230,7 +1245,7 @@ msgstr "Comunicar com o leitor Sovos."
 
 #: /home/kovid/work/calibre/src/calibre/devices/teclast/driver.py:78
 msgid "Communicate with the Sunstech EB700 reader."
-msgstr ""
+msgstr "Comunicar-se com o Sunstech EB700 reader."
 
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/device.py:258
 msgid "Unable to detect the %s disk drive. Try rebooting."
@@ -1254,6 +1269,8 @@ msgid ""
 "Unable to detect the %s disk drive. Either the device has already been "
 "ejected, or your kernel is exporting a deprecated version of SYSFS."
 msgstr ""
+"Não foi possível detectar o disco %s. O dispositivo já foi ejetado, ou o seu "
+"kernel está exportando uma versão deprecada do SYSFS."
 
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/device.py:617
 msgid "Unable to mount main memory (Error code: %d)"
@@ -1264,6 +1281,8 @@ msgid ""
 "The main memory of %s is read only. This usually happens because of file "
 "system errors."
 msgstr ""
+"A memória principal de %s é somente leitura. Isto normalmente acontece "
+"devido a erros no sistema de arquivos."
 
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/device.py:816
 #: /home/kovid/work/calibre/src/calibre/devices/usbms/device.py:818
@@ -1693,6 +1712,9 @@ msgid ""
 "is: %default. Links are only added to the TOC if less than the threshold "
 "number of chapters were detected."
 msgstr ""
+"Número máximo de links para inserir no sumário. Use 0 para desabilitar. O "
+"padrão é: %default. Links serão adicionados ao sumário somente se o número "
+"encontrado for menor que o limite máximo de capítulos."
 
 #: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:256
 msgid ""
@@ -2076,7 +2098,7 @@ msgstr "Você deve especificar um arquivo do tipo epub"
 
 #: /home/kovid/work/calibre/src/calibre/ebooks/epub/fix/unmanifested.py:17
 msgid "Fix unmanifested files"
-msgstr ""
+msgstr "Corrigir arquivos não listados no manifesto"
 
 #: /home/kovid/work/calibre/src/calibre/ebooks/epub/fix/unmanifested.py:21
 msgid ""

From 868fa550ee4c420620be9fdf4ab13269eae742a4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 7 Jan 2011 10:58:20 -0700
Subject: [PATCH 27/34] When parsing XML if the XML starts with a UTF-8 BOM
 decode as UTF-8. FB2 Input: Handle entities

---
 src/calibre/ebooks/chardet/__init__.py | 8 +++++---
 src/calibre/ebooks/fb2/input.py        | 6 +++++-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/chardet/__init__.py b/src/calibre/ebooks/chardet/__init__.py
index dd279c6559..f9bca3c8d4 100644
--- a/src/calibre/ebooks/chardet/__init__.py
+++ b/src/calibre/ebooks/chardet/__init__.py
@@ -18,7 +18,7 @@
 
 __version__ = "1.0"
 
-import re
+import re, codecs
 
 def detect(aBuf):
     import calibre.ebooks.chardet.universaldetector as universaldetector
@@ -83,9 +83,11 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
     if not raw:
         return u'', encoding
     if not isinstance(raw, unicode):
-        if raw.startswith('\xff\xfe'):
+        if raw.startswith(codecs.BOM_UTF8):
+            raw, encoding = raw.decode('utf-8')[1:], 'utf-8'
+        elif raw.startswith(codecs.BOM_UTF16_LE):
             raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
-        elif raw.startswith('\xfe\xff'):
+        elif raw.startswith(codecs.BOM_UTF16_BE):
             raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
     if not isinstance(raw, unicode):
         for pat in ENCODING_PATS:
diff --git a/src/calibre/ebooks/fb2/input.py b/src/calibre/ebooks/fb2/input.py
index 1f9a3ffe95..b019873d39 100644
--- a/src/calibre/ebooks/fb2/input.py
+++ b/src/calibre/ebooks/fb2/input.py
@@ -46,15 +46,19 @@ class FB2Input(InputFormatPlugin):
         log.debug('Parsing XML...')
         raw = stream.read().replace('\0', '')
         raw = xml_to_unicode(raw, strip_encoding_pats=True,
-            assume_utf8=True)[0]
+            assume_utf8=True, resolve_entities=True)[0]
         try:
             doc = etree.fromstring(raw)
         except etree.XMLSyntaxError:
             try:
                 doc = etree.fromstring(raw, parser=RECOVER_PARSER)
+                if doc is None:
+                    raise Exception('parse failed')
             except:
                 doc = etree.fromstring(raw.replace('& ', '&amp;'),
                         parser=RECOVER_PARSER)
+        if doc is None:
+            raise ValueError('The FB2 file is not valid XML')
         stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
         css = ''
         for s in stylesheets:

From 5774429f78e42a0ad38ac56b3a1b031d0b3ef18f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 7 Jan 2011 11:33:55 -0700
Subject: [PATCH 28/34] Fix #8225 (Ampersands and Saved Searches)

---
 src/calibre/library/server/opds.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/library/server/opds.py b/src/calibre/library/server/opds.py
index ab0853add9..ead7cf1938 100644
--- a/src/calibre/library/server/opds.py
+++ b/src/calibre/library/server/opds.py
@@ -128,9 +128,9 @@ def CATALOG_ENTRY(item, item_kind, base_href, version, updated,
         count = ''
     if item.category == 'authors' and \
             tweaks['categories_use_field_for_author_name'] == 'author_sort':
-        name = xml(item.sort)
+        name = item.sort
     else:
-        name = xml(item.name)
+        name = item.name
     return E.entry(
             TITLE(name + ('' if not add_kind else ' (%s)'%item_kind)),
             ID(id_),

From 3c289cdf097e0a6ed0c73098fc9d0139eab1f130 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 7 Jan 2011 12:09:49 -0700
Subject: [PATCH 29/34] Fix regression that broke conversion of PNG images in
 PDF files on OS X. Fixes #8215 (Images not extracted from PDF during
 conversion)


From 65d790ad5968a7e008c93e1483aadcbc8fc6b19d Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 7 Jan 2011 12:16:07 -0700
Subject: [PATCH 30/34] Add another motorola droid

---
 src/calibre/devices/android/driver.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py
index c2db8ddd77..b7e2f0fd2e 100644
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@@ -27,8 +27,9 @@ class ANDROID(USBMS):
             0x040d : { 0x8510 : [0x0001], 0x0851 : [0x1] },
 
             # Motorola
-            0x22b8 : { 0x41d9 : [0x216], 0x2d61: [0x100], 0x2d67 : [0x100],
-                0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216] },
+            0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100],
+                       0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216],
+                       0x4286 : [0x216] },
 
             # Sony Ericsson
             0xfce : { 0xd12e : [0x0100]},

From 5d6e4c8931e2bf68c3a84407e2bf74a7c1dc92f1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 7 Jan 2011 12:17:26 -0700
Subject: [PATCH 31/34] Fix regression causing the template formatter to
 interpret a missing format letter as ERROR instead of 's'.

---
 src/calibre/utils/formatter.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/calibre/utils/formatter.py b/src/calibre/utils/formatter.py
index 23763a25bf..46b52b9ce5 100644
--- a/src/calibre/utils/formatter.py
+++ b/src/calibre/utils/formatter.py
@@ -316,8 +316,6 @@ class TemplateFormatter(string.Formatter):
             except:
                 raise ValueError(
                     _('format: type {0} requires a decimal (float) value, got {1}').format(typ, val))
-        else:
-            raise ValueError(_('format: unknown format type letter {0}').format(typ))
         return unicode(('{0:'+fmt+'}').format(val))
 
     def _explode_format_string(self, fmt):

From 65ddab2074835f9639a573617723dc623fd48418 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 7 Jan 2011 12:23:18 -0700
Subject: [PATCH 32/34] Template language: Add a list_item function for use
 with tags like columns. See User Manual for details

---
 src/calibre/manual/template_lang.rst |  1 +
 src/calibre/utils/formatter.py       | 15 +++++++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/calibre/manual/template_lang.rst b/src/calibre/manual/template_lang.rst
index 1bf08c11f9..f64a413d3e 100644
--- a/src/calibre/manual/template_lang.rst
+++ b/src/calibre/manual/template_lang.rst
@@ -121,6 +121,7 @@ The functions available are:
     * ``contains(pattern, text if match, text if not match`` -- checks if field contains matches for the regular expression `pattern`. Returns `text if match` if matches are found, otherwise it returns `text if no match`.
     * ``count(separator)`` -- interprets the value as a list of items separated by `separator`, returning the number of items in the list. Most lists use a comma as the separator, but authors uses an ampersand. Examples: `{tags:count(,)}`, `{authors:count(&)}`
     * ``ifempty(text)``	-- if the field is not empty, return the value of the field. Otherwise return `text`.
+    * ``list_item(index, separator)`` -- interpret the value as a list of items separated by `separator`, returning the `index`th item. The first item is number zero. The last item can be returned using `list_item(-1,separator)`. If the item is not in the list, then the empty value is returned. The separator has the same meaning as in the `count` function.
     * ``lookup(pattern, field, pattern, field, ..., else_field)`` -- like switch, except the arguments are field (metadata) names, not text. The value of the appropriate field will be fetched and used. Note that because composite columns are fields, you can use this function in one composite field to use the value of some other composite field. This is extremely useful when constructing variable save paths (more later).
     * ``re(pattern, replacement)`` -- return the field after applying the regular expression. All instances of `pattern` are replaced with `replacement`. As in all of |app|, these are python-compatible regular expressions.
     * ``shorten(left chars, middle text, right chars)`` -- Return a shortened version of the field, consisting of `left chars` characters from the beginning of the field, followed by `middle text`, followed by `right chars` characters from the end of the string. `Left chars` and `right chars` must be integers. For example, assume the title of the book is `Ancient English Laws in the Times of Ivanhoe`, and you want it to fit in a space of at most 15 characters. If you use ``{title:shorten(9,-,5)}``, the result will be `Ancient E-nhoe`. If the field's length is less than ``left chars`` + ``right chars`` + the length of ``middle text``, then the field will be used intact. For example, the title `The Dome` would not be changed.
diff --git a/src/calibre/utils/formatter.py b/src/calibre/utils/formatter.py
index 46b52b9ce5..2e4f843c3d 100644
--- a/src/calibre/utils/formatter.py
+++ b/src/calibre/utils/formatter.py
@@ -281,19 +281,30 @@ class TemplateFormatter(string.Formatter):
     def _count(self, val, sep):
         return unicode(len(val.split(sep)))
 
+    def _list_item(self, val, index, sep):
+        if not val:
+            return ''
+        index = int(index)
+        val = val.split(sep)
+        try:
+            return val[index]
+        except:
+            return ''
+
     functions = {
                     'uppercase'     : (0, lambda s,x: x.upper()),
                     'lowercase'     : (0, lambda s,x: x.lower()),
                     'titlecase'     : (0, lambda s,x: titlecase(x)),
                     'capitalize'    : (0, lambda s,x: capitalize(x)),
                     'contains'      : (3, _contains),
+                    'count'         : (1, _count),
                     'ifempty'       : (1, _ifempty),
+                    'list_item'     : (2, _list_item),
                     'lookup'        : (-1, _lookup),
                     're'            : (2, _re),
                     'shorten'       : (3, _shorten),
                     'switch'        : (-1, _switch),
-                    'test'          : (2, _test),
-                    'count'         : (1, _count),
+                    'test'          : (2, _test)
         }
 
     def _do_format(self, val, fmt):

From 19bf985c144832f96a65e9383680800b2d498774 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 7 Jan 2011 13:05:16 -0700
Subject: [PATCH 33/34] Don't unnecessarily scroll the book list horizontally

---
 src/calibre/gui2/library/views.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/gui2/library/views.py b/src/calibre/gui2/library/views.py
index c1dd5b3766..e1e9cf4456 100644
--- a/src/calibre/gui2/library/views.py
+++ b/src/calibre/gui2/library/views.py
@@ -612,7 +612,7 @@ class BooksView(QTableView): # {{{
         if row > -1:
             h = self.horizontalHeader()
             for i in range(h.count()):
-                if not h.isSectionHidden(i):
+                if not h.isSectionHidden(i) and h.sectionViewportPosition(i) >= 0:
                     self.scrollTo(self.model().index(row, i))
                     break
 

From 83e116d59b0e355c0c77c0943e28e0b700430e2d Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 7 Jan 2011 13:10:18 -0700
Subject: [PATCH 34/34] version 0.7.38

---
 Changelog.yaml           | 94 ++++++++++++++++++++++++++++++++++++++++
 src/calibre/constants.py |  2 +-
 2 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/Changelog.yaml b/Changelog.yaml
index 699aa3a531..82b335bbdd 100644
--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -4,6 +4,100 @@
 # for important features/bug fixes.
 # Also, each release can have new and improved recipes.
 
+- version: 0.7.38
+  date: 2011-01-07
+
+  new features:
+    - title: "Reduce startup time when using a composite custom column"
+
+    - title: "Template language: Add a list_item function for use with tags like columns. See User Manual for details"
+
+    - title: "TXT Input: Attempt to detect the input encoding when not specified. Auto detect paragraph structure and formatting markup."
+
+    - title: "Search & replace: Add ability to manipulate number and boolean columns."
+
+    - title: "Add type ahead completion to the advanced search dialog."
+      tickets: [8035]
+
+    - title: "Double click on plugin in Preferences dialog to customize"
+      tickets: [8175]
+
+    - title: "Allow customization of the SONY driver to send thumbnail to the device. Useful with newer SONY readers"
+      tickets: [8161]
+
+    - title: "Smarten punctuation: Convert double dashes to em dashes. Preprocessing: Various tweaks"
+
+  bug fixes:
+    - title: "Fix regression causing the template formatter to interpret a missing format letter as ERROR instead of 's'."
+
+    - title: "Fix regression that broke conversion of PNG images in PDF files on OS X."
+      tickets: [8215]
+
+    - title: "Content server: Fix improper XML escaping of category titles in the OPDS feeds"
+      tickets: [8225]
+
+    - title: "When decoding XML if the XML starts with a UTF-8 BOM decode as UTF-8. Fixes parsing of FB2 files with UTF-8 BOMs"
+
+    - title: "E-book viewer: When scrolling to a bookmark and the content is wider than the window, do not scroll in the horizontal direction"
+
+    - title: "E-book viewer: Fix next page skipping the bottom of chapters when the content is wider than the window."
+      tickets: [8153]
+
+    - title: "FB2 Output: Insert covers."
+      tickets: [8172]
+
+    - title: "Content server: When serving OPDS feeds handle html descriptions that have namespaced attributes."
+      tickets: [7938]
+
+    - title: "When downloading metadata from isbndb.com, download a maximum of 30 results rather than 1000"
+
+    - title: "Fix sorting of tags column"
+
+    - title: "Change search/replace to show commas instead of vertical bars as the separator for multiple authors"
+
+    - title: "Template language: Make all column names case insensitive"
+
+    - title: "Fix bug that prevented the Disabled option for Tag Browser partitioning from working in the Preferences dialog"
+
+    - title: "Fix bug when using tags like custom column in the template language"
+
+    - title: "Fix bug where composite custom columns using general_program_mode fields are not evaluated correctly when used in a template."
+
+    - title: "ImageMagick interface: Don't crash when asked to open empty image files"
+
+    - title: "Kobo driver: Add TXT,CBZ,CBR to supported formats list"
+      tickets: [8124]
+
+    - title: "Don't unnecessarily scroll the book list horizontally when re-selecting previously selected rows."
+
+  new recipes:
+    - title: "New London Day"
+      author: "Being"
+      
+    - title: "Walla"
+      author: "marbs"
+
+    - title: "New Journal of Physics"
+      author: "Chema Cortes"
+
+    - title: "The Baltimore Sun"
+      author: "Josh Hall"
+
+    - title: "Arabian Business and Sunday Times (UK)"
+      author: "Darko Miletic"
+
+    - title: "Deia"
+      author: "Gerardo Diez"
+
+    - title: "Smarter Planet"
+      author: "Jack Mason"
+
+
+  improved recipes:
+    - The Atlantic
+    - Danas
+    - Ledevoir
+
 - version: 0.7.37
   date: 2011-01-02
 
diff --git a/src/calibre/constants.py b/src/calibre/constants.py
index bc359a2b79..2443c55d9d 100644
--- a/src/calibre/constants.py
+++ b/src/calibre/constants.py
@@ -2,7 +2,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 __appname__   = 'calibre'
-__version__   = '0.7.37'
+__version__   = '0.7.38'
 __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"
 
 import re