From d335bccd67da45f2fd6b69b81e7e8d6db89fa378 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 20 Sep 2010 00:31:22 +0800
Subject: [PATCH 01/32] fix "processor" typo in txt-processing comment

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 6a5eaa4a34..f38d02309a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -83,7 +83,7 @@ class PreProcessor(object):
         # <pre> tags), check and  mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
              self.log("not enough paragraph markers, adding now")
-             # check if content is in pre tags, use txt procesor to mark up if so
+             # check if content is in pre tags, use txt processor to mark up if so
              pre = re.compile(r'<pre>', re.IGNORECASE)
              if len(pre.findall(html)) == 1:
                  self.log("Running Text Processing")

From 301af532c6940ec8082dbe6ece4dca351417ac63 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 20 Sep 2010 09:57:46 +0800
Subject: [PATCH 02/32] made em-dash unwrapping line length dependent, as
 sometimes it's used as an ellipsis alternative

---
 src/calibre/ebooks/conversion/preprocess.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 3b1239814a..d6b5460552 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -117,7 +117,7 @@ class Dehyphenator(object):
     def __init__(self):
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
         self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
         self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
@@ -374,10 +374,8 @@ class HTMLPreProcessor(object):
                 print 'Failed to parse remove_footer regexp'
                 traceback.print_exc()
 
-        # unwrap em/en dashes, delete soft hyphens - moved here so it's executed after header/footer removal
+        # delete soft hyphens - moved here so it's executed after header/footer removal
         if is_pdftohtml:
-            # unwrap em/en dashes
-            end_rules.append((re.compile(u'(?<=[–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens
             end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens with formatting
@@ -397,6 +395,8 @@ class HTMLPreProcessor(object):
                     # Un wrap using punctuation
                     (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                 )
+                # unwrap em/en dashes
+                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
 
         for rule in self.PREPROCESS + start_rules:
             html = rule[0].sub(rule[1], html)

From 936451853caa1190eff41bf07a28f39005da5fb3 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 21 Sep 2010 18:18:50 -1000
Subject: [PATCH 03/32] tuned dehyphen code to better handle unwrapped docs,
 added line histogram function to determine whether a document has hard breaks
 or not

---
 src/calibre/ebooks/conversion/preprocess.py | 138 +++++++++++++++-----
 src/calibre/ebooks/conversion/utils.py      |  48 ++++---
 2 files changed, 134 insertions(+), 52 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index d6b5460552..c42b29e0e4 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -62,49 +62,97 @@ def wrap_lines(match):
     else:
                return ital+' '
 
-def line_length(format, raw, percent):
+def line_length(format, raw, percent, test_type):
     '''
-    raw is the raw text to find the line length to use for wrapping.
+    Analyses the document to see if hard line breaks exist or to find the 
+    median line length.
+    format is the type of document analysis will be done against.
+    raw is the raw text to determine the line length to use for wrapping.
     percentage is a decimal number, 0 - 1 which is used to determine
     how far in the list of line lengths to use. The list of line lengths is
     ordered smallest to larged and does not include duplicates. 0.5 is the
     median value.
+    test_type sets whether to use the line length to return the median or a
+    do a histogram analysis to see if unwrapping is required.
     '''
     raw = raw.replace('&nbsp;', ' ')
     if format == 'html':
-        linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
+        linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
     elif format == 'pdf':
         linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
     elif format == 'spanned_html':
         linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
     lines = linere.findall(raw)
 
-    lengths = []
-    for line in lines:
-        if len(line) > 0:
-            lengths.append(len(line))
+    if test_type == 'median':
+        lengths = []
+        for line in lines:
+            if len(line) > 0:
+                lengths.append(len(line))
 
-    if not lengths:
-        return 0
+        if not lengths:
+            return 0
 
-    lengths = list(set(lengths))
-    total = sum(lengths)
-    avg = total / len(lengths)
-    max_line = avg * 2
+        lengths = list(set(lengths))
+        total = sum(lengths)
+        avg = total / len(lengths)
+        max_line = avg * 2
 
-    lengths = sorted(lengths)
-    for i in range(len(lengths) - 1, -1, -1):
-        if lengths[i] > max_line:
-            del lengths[i]
+        lengths = sorted(lengths)
+        for i in range(len(lengths) - 1, -1, -1):
+            if lengths[i] > max_line:
+                del lengths[i]
 
-    if percent > 1:
-        percent = 1
-    if percent < 0:
-        percent = 0
+        if percent > 1:
+            percent = 1
+        if percent < 0:
+            percent = 0
 
-    index = int(len(lengths) * percent) - 1
+        index = int(len(lengths) * percent) - 1
 
-    return lengths[index]
+        return lengths[index]
+
+    if test_type == 'histogram':
+        minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
+        maxLineLength=1900 # Discard larger than this to stay in range
+        buckets=20 # Each line is divided into a bucket based on length
+
+        #print "there are "+str(len(lines))+" lines"
+        max = 0
+        for line in lines:
+            l = len(line)
+            if l > max:
+                max = l
+        print "max line found is "+str(max)
+        # Build the line length histogram
+        hRaw = [ 0 for i in range(0,buckets) ]
+        for line in lines:
+            l = len(line)
+            if l > minLineLength and l < maxLineLength:
+                    l = int(l/100)
+                    #print "adding "+str(l)
+                    hRaw[l]+=1
+
+        # Normalize the histogram into percents
+        totalLines = len(lines)
+        h = [ float(count)/totalLines for count in hRaw ]
+        print "\nhRaw histogram lengths are: "+str(hRaw)
+        print "              percents are: "+str(h)+"\n"
+        
+        # Find the biggest bucket
+        maxValue = 0
+        peakPosition = 0
+        for i in range(0,len(h)):
+            if h[i] > maxValue:
+                maxValue = h[i]
+                peakPosition = i
+
+        if maxValue < percent:
+            #print "Line lengths are too variable. Not unwrapping."
+            return False
+        else:
+            #print str(maxValue)+" of the lines were in one bucket"
+            return True
 
 class Dehyphenator(object):
     '''
@@ -117,7 +165,7 @@ class Dehyphenator(object):
     def __init__(self):
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
         self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
         self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
@@ -125,34 +173,54 @@ class Dehyphenator(object):
     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
         secondhalf = match.group('secondpart')
+        try:
+            wraptags = match.group('wraptags')
+        except:
+            wraptags = ''
         hyphenated = str(firsthalf) + "-" + str(secondhalf)
         dehyphenated = str(firsthalf) + str(secondhalf)
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
         booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
-        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
-        match = booklookup.search(self.html)
-        if match:
-            #print "returned dehyphenated word: " + str(dehyphenated)
-            return dehyphenated
+        print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        if self.format == 'html_cleanup':
+           match = booklookup.search(self.html)
+           hyphenmatch = re.search(u'%s' % hyphenated, self.html)
+           if match:
+               print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+               return dehyphenated
+           elif hyphenmatch:
+               print "Cleanup:returned hyphenated word: " + str(hyphenated)
+               return hyphenated
+           else:
+               print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+               return firsthalf+u'\u2014'+wraptags+secondhalf
+               
         else:
-            #print "returned hyphenated word: " + str(hyphenated)
-            return hyphenated
+            match = booklookup.search(self.html)
+            if match:
+                print "returned dehyphenated word: " + str(dehyphenated)
+                return dehyphenated
+            else:
+                print "returned hyphenated word: " + str(hyphenated)
+                return hyphenated
 
     def __call__(self, html, format, length=1):
         self.html = html
+        self.format = format
         if format == 'html':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(<p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
             intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+        elif format == 'html_cleanup':
+            intextmatch = re.compile(u'(?P<firstpart>[^“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
 
         html = intextmatch.sub(self.dehyphenate, html)
         return html
 
-
 class CSSPreProcessor(object):
 
     PAGE_PAT   = re.compile(r'@page[^{]*?{[^}]*?}')
@@ -388,7 +456,7 @@ class HTMLPreProcessor(object):
                 end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
 
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
-            length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
+            length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'), 'median')
             if length:
                 # print "The pdf line length returned is " + str(length)
                 end_rules.append(
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f38d02309a..7e85e24a83 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -153,7 +153,6 @@ class PreProcessor(object):
 
         ###### Unwrap lines ######
         #
-        self.log("Unwrapping Lines")
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags
         # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
         # that lines can be un-wrapped across page boundaries
@@ -168,25 +167,40 @@ class PreProcessor(object):
                 format = 'html'
         else:
             format = 'html'
-
+        # Check Line histogram to determine if the document uses hard line breaks, If 50% or 
+        # more of the lines break in the same region of the document then unwrapping is required
+        hardbreaks = line_length(format, html, .50, 'histogram')
+        print "Hard line breaks check returned "+str(hardbreaks)
         # Calculate Length
-        length = line_length(format, html, getattr(self.extra_opts,
-            'html_unwrap_factor', 0.4))
+        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+        length = line_length(format, html, unwrap_factor, 'median')
         self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
-        max_length = length * 1.4
-        min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})")
-        #
-        # Unwrap em/en dashes, delete soft-hyphens
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
+        if hardbreaks or unwrap_factor < 0.4:
+            self.log("Unwrapping required, unwrapping Lines")
+            # Unwrap em/en dashes
+            #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+            html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
+            # Dehyphenate
+            self.log("Unwrapping/Removing hyphens")
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html', length)
+            self.log("Done dehyphenating")
+            # Unwrap lines using punctation and line length
+            unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+            html = unwrap.sub(' ', html)
+            #check any remaining hyphens, but only unwrap if there is a match
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+        else:
+            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+            self.log("Cleaning up hyphenation")
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+            self.log("Done dehyphenating")
+            
+        # delete soft hyphens
         html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
-        html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html)
-        # Dehyphenate
-        dehyphenator = Dehyphenator()
-        html = dehyphenator(html,'html', length)
-
-        # Unwrap lines using punctation and line length
-        unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-        html = unwrap.sub(' ', html)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < 10:

From 5aa36581c57e80a791071aaf9fcddb7fd4e4eaff Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 25 Sep 2010 12:34:01 -1000
Subject: [PATCH 04/32] Building chapter marking regexes using variables to
 increase manageability, switched to using backreferences to increase
 reliability

---
 src/calibre/ebooks/conversion/utils.py | 51 ++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 7 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 7e85e24a83..5e3cac7714 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -113,11 +113,12 @@ class PreProcessor(object):
         # Get rid of empty <o:p> tags to simplify other processing
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Get rid of empty span, bold, & italics tags
-        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
 
-        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+        # If more than 40% of the lines are empty paragraphs and the user has enabled remove
+        # paragraph spacing then delete blank lines to clean up spacing
         linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
@@ -129,26 +130,63 @@ class PreProcessor(object):
             'remove_paragraph_spacing', False):
                 self.log("deleting blank lines")
                 html = blankreg.sub('', html)
+            elif float(len(blanklines)) / float(len(lines)) > 0.40:
+               blanks_between_paragraphs = True
+               print "blanks between paragraphs is marked True"
+            else:
+                blanks_between_paragraphs = False
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
         html = re.sub(r"\s*<p>\s*", "\n<p>", html)
         # detect chapters/sections to match xpath or splitting logic
+        #
+        # Build the Regular Expressions in pieces
+        lookahead = "(?=<(p|div))"
+        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>span|[ibu])[^>]*>)?\s*(<(?P<inner2>span|[ibu])[^>]*>)?\s*(<(?P<inner3>span|[ibu])[^>]*>)?\s*"
+        chapter_header_open = r"(?P<chap>"
+        chapter_header_close = ")\s*"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
+        if blanks_between_paragraphs:
+            blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
+        else:
+            blank_lines = ""
+        opt_title_open = "("
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>span|[ibu])[^>]*>)?\s*(<(?P<inner5>span|[ibu])[^>]*>)?\s*(<(?P<inner6>span|[ibu])[^>]*>)?\s*"
+        title_header_open = "(?P<title>"
+        title_header_close = ")\s*"
+        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
+        opt_title_close = ")?"
+        
+        default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
+        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
+        numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
+        uppercase_chapters = r"\s*.?([A-Z#\-\s]+)\s*"
+        
+        chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+        print chapter_marker
+        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")       
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
         #
         # Start with most typical chapter headings, get more aggressive until one works
         if self.html_preprocess_sections < 10:
-            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
+            chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+            #chapdetect = re.compile(r"(?=<(p|div))<(?P<outer>p|div)[^>]*>\s*(<(?P<inner_one>span|[ibu])\s[^>]*>)?\s*(<(?P<inner_two>span|[ibu])\s[^>]*>)?\s*(<(?P<inner_three>span|[ibu])\s[^>]*>)?\s*(?P<chap>.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8})\s*(</(?P=<inner_three>)>)?\s*(</(?P=<inner_two>)>)?\s*(</(?P=<inner_one>)\s[^>]*>)?\s</(?P=<outer>)>(<(?P<outer_two>p|div)[^>]*>\s*(<(?P<inner_four>span|[ibu])\s[^>]*>)?\s*(<(?P<inner_five>span|[ibu])\s[^>]*>)?\s*(<(?P<inner_six>span|[ibu])\s[^>]*>)?\s*(?P<title>(\s*[\w\'\"-]+){1,5})\s*(</(?P=<inner_six>)>)?\s*(</(?P=<inner_five>)>)?\s*(</(?P=<inner_four>)\s[^>]*>)?\s</(?P=<outer_two>)>)?", re.IGNORECASE)
+            #chapdetect = re.compile(r'(?=</?(br|p))(<(?P<outer>(/?br|p))[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(?P=outer)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
             html = chapdetect.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+            #chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
 
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            chapdetect2 = re.compile(r'%s' % chapter_marker,  re.UNICODE)
+            #chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
 
         ###### Unwrap lines ######
@@ -179,7 +217,6 @@ class PreProcessor(object):
         if hardbreaks or unwrap_factor < 0.4:
             self.log("Unwrapping required, unwrapping Lines")
             # Unwrap em/en dashes
-            #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
             html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
             # Dehyphenate
             self.log("Unwrapping/Removing hyphens")
@@ -206,7 +243,7 @@ class PreProcessor(object):
         if self.html_preprocess_sections < 10:
             self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
-            html = chapdetect3.sub(self.chapter_break, html)
+            #html = chapdetect3.sub(self.chapter_break, html)
         # search for places where a first or second level heading is immediately followed by another
         # top level heading.  demote the second heading to h3 to prevent splitting between chapter
         # headings and titles, images, etc

From 8b7ef0984f4bed6acc64b6e6124352c65b22eb65 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 25 Sep 2010 12:53:40 -1000
Subject: [PATCH 05/32] comment out debug print statements in preprocess.py and utils.py

---
 src/calibre/ebooks/conversion/preprocess.py | 18 +++++++++---------
 src/calibre/ebooks/conversion/utils.py      | 11 ++++-------
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index c42b29e0e4..a18ff07d44 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -123,7 +123,7 @@ def line_length(format, raw, percent, test_type):
             l = len(line)
             if l > max:
                 max = l
-        print "max line found is "+str(max)
+        #print "max line found is "+str(max)
         # Build the line length histogram
         hRaw = [ 0 for i in range(0,buckets) ]
         for line in lines:
@@ -136,8 +136,8 @@ def line_length(format, raw, percent, test_type):
         # Normalize the histogram into percents
         totalLines = len(lines)
         h = [ float(count)/totalLines for count in hRaw ]
-        print "\nhRaw histogram lengths are: "+str(hRaw)
-        print "              percents are: "+str(h)+"\n"
+        #print "\nhRaw histogram lengths are: "+str(hRaw)
+        #print "              percents are: "+str(h)+"\n"
         
         # Find the biggest bucket
         maxValue = 0
@@ -183,27 +183,27 @@ class Dehyphenator(object):
         if self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
         booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
-        print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
         if self.format == 'html_cleanup':
            match = booklookup.search(self.html)
            hyphenmatch = re.search(u'%s' % hyphenated, self.html)
            if match:
-               print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+               #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
                return dehyphenated
            elif hyphenmatch:
-               print "Cleanup:returned hyphenated word: " + str(hyphenated)
+               #print "Cleanup:returned hyphenated word: " + str(hyphenated)
                return hyphenated
            else:
-               print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+               #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
                return firsthalf+u'\u2014'+wraptags+secondhalf
                
         else:
             match = booklookup.search(self.html)
             if match:
-                print "returned dehyphenated word: " + str(dehyphenated)
+                #print "returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
             else:
-                print "returned hyphenated word: " + str(hyphenated)
+                #print "returned hyphenated word: " + str(hyphenated)
                 return hyphenated
 
     def __call__(self, html, format, length=1):
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 5e3cac7714..555f42702b 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -132,7 +132,7 @@ class PreProcessor(object):
                 html = blankreg.sub('', html)
             elif float(len(blanklines)) / float(len(lines)) > 0.40:
                blanks_between_paragraphs = True
-               print "blanks between paragraphs is marked True"
+               #print "blanks between paragraphs is marked True"
             else:
                 blanks_between_paragraphs = False
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
@@ -163,7 +163,7 @@ class PreProcessor(object):
         uppercase_chapters = r"\s*.?([A-Z#\-\s]+)\s*"
         
         chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-        print chapter_marker
+        #print chapter_marker
         #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")       
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
@@ -172,14 +172,11 @@ class PreProcessor(object):
         # Start with most typical chapter headings, get more aggressive until one works
         if self.html_preprocess_sections < 10:
             chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            #chapdetect = re.compile(r"(?=<(p|div))<(?P<outer>p|div)[^>]*>\s*(<(?P<inner_one>span|[ibu])\s[^>]*>)?\s*(<(?P<inner_two>span|[ibu])\s[^>]*>)?\s*(<(?P<inner_three>span|[ibu])\s[^>]*>)?\s*(?P<chap>.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8})\s*(</(?P=<inner_three>)>)?\s*(</(?P=<inner_two>)>)?\s*(</(?P=<inner_one>)\s[^>]*>)?\s</(?P=<outer>)>(<(?P<outer_two>p|div)[^>]*>\s*(<(?P<inner_four>span|[ibu])\s[^>]*>)?\s*(<(?P<inner_five>span|[ibu])\s[^>]*>)?\s*(<(?P<inner_six>span|[ibu])\s[^>]*>)?\s*(?P<title>(\s*[\w\'\"-]+){1,5})\s*(</(?P=<inner_six>)>)?\s*(</(?P=<inner_five>)>)?\s*(</(?P=<inner_four>)\s[^>]*>)?\s</(?P=<outer_two>)>)?", re.IGNORECASE)
-            #chapdetect = re.compile(r'(?=</?(br|p))(<(?P<outer>(/?br|p))[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(?P=outer)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
             html = chapdetect.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
             chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
             chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            #chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
 
         if self.html_preprocess_sections < 10:
@@ -208,7 +205,7 @@ class PreProcessor(object):
         # Check Line histogram to determine if the document uses hard line breaks, If 50% or 
         # more of the lines break in the same region of the document then unwrapping is required
         hardbreaks = line_length(format, html, .50, 'histogram')
-        print "Hard line breaks check returned "+str(hardbreaks)
+        #print "Hard line breaks check returned "+str(hardbreaks)
         # Calculate Length
         unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
         length = line_length(format, html, unwrap_factor, 'median')
@@ -243,7 +240,7 @@ class PreProcessor(object):
         if self.html_preprocess_sections < 10:
             self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
-            #html = chapdetect3.sub(self.chapter_break, html)
+            html = chapdetect3.sub(self.chapter_break, html)
         # search for places where a first or second level heading is immediately followed by another
         # top level heading.  demote the second heading to h3 to prevent splitting between chapter
         # headings and titles, images, etc

From 394f09e7f48ae6c1b738694970f19eb1c4555aaa Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 26 Sep 2010 10:23:02 +0900
Subject: [PATCH 06/32] Escaping meta-characters before compiling words as a
 regex for removing hyphens

---
 src/calibre/customize/profiles.py           | 4 ++++
 src/calibre/ebooks/conversion/preprocess.py | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py
index 2b5eb5011e..ba0cd187e4 100644
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@@ -61,6 +61,7 @@ class SonyReaderInput(InputProfile):
     dpi                       = 168.451
     fbase                     = 12
     fsizes                    = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
+    #unsupported_unicode_chars = [\u2018, \u2019, \u201a, \u201b, \u201c, \u201d, \u201e, \u201f]
 
 class SonyReader300Input(SonyReaderInput):
 
@@ -250,6 +251,9 @@ class OutputProfile(Plugin):
 
     #: The character used to represent a star in ratings
     ratings_char = u'*'
+    
+    #: Unsupported unicode characters to be replaced during preprocessing
+    unsupported_unicode_chars = []
 
     @classmethod
     def tags_to_string(cls, tags):
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 0b981cf6f7..b4815cb35e 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -182,8 +182,10 @@ class Dehyphenator(object):
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
-        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
+        # escape any meta-characters which may be in the lookup word
+        lookupword = re.sub(r'(?P<meta>[\[\]\\\^\$\.\|\?\*\+\(\)])', r'\\\g<meta>', lookupword)
         #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
         if self.format == 'html_cleanup':
            match = booklookup.search(self.html)
            hyphenmatch = re.search(u'%s' % hyphenated, self.html)

From 7f37832aa8d491c4648cca9eb5e0d8a20552a9aa Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 26 Sep 2010 14:16:07 +0900
Subject: [PATCH 07/32] added framework for re-mapping unsupported unicode
 chars based on Output Profile

---
 src/calibre/customize/profiles.py           | 4 +++-
 src/calibre/ebooks/conversion/preprocess.py | 5 +++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py
index ba0cd187e4..e281179565 100644
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@@ -1,3 +1,4 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
@@ -61,7 +62,6 @@ class SonyReaderInput(InputProfile):
     dpi                       = 168.451
     fbase                     = 12
     fsizes                    = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
-    #unsupported_unicode_chars = [\u2018, \u2019, \u201a, \u201b, \u201c, \u201d, \u201e, \u201f]
 
 class SonyReader300Input(SonyReaderInput):
 
@@ -426,6 +426,8 @@ class SonyReaderOutput(OutputProfile):
     dpi                       = 168.451
     fbase                     = 12
     fsizes                    = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
+    unsupported_unicode_chars = [[u'\u2018',u'‘'], [u'\u2019',u'’'], [u'\u201a',u'‘'], [u'\u201b',u'’'], [u'\u201c',u'“'], [u'\u201d',u'”'], [u'\u201e',u'“'], [u'\u201f',u'”']]
+
 
 class KoboReaderOutput(OutputProfile):
 
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index b4815cb35e..d2105a4189 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -524,6 +524,11 @@ class HTMLPreProcessor(object):
         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = self.smarten_punctuation(html)
 
+        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
+        print str(unsupported_unicode_chars)
+        for [char, replacement] in unsupported_unicode_chars:
+            html = re.sub('%s' % char, replacement, html)
+
         return html
 
     def smarten_punctuation(self, html):

From 32f231549f922a8ba49c20736bfe3b7e27632950 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 26 Sep 2010 20:51:24 +0900
Subject: [PATCH 08/32] ...

---
 src/calibre/ebooks/conversion/preprocess.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index d2105a4189..23d073cfa4 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -525,7 +525,6 @@ class HTMLPreProcessor(object):
             html = self.smarten_punctuation(html)
 
         unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
-        print str(unsupported_unicode_chars)
         for [char, replacement] in unsupported_unicode_chars:
             html = re.sub('%s' % char, replacement, html)
 

From 105591980657bfc945e45825d33b94bb385486d6 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 27 Sep 2010 17:00:04 +0800
Subject: [PATCH 09/32] regex tweaks, including fixes for
 http://bugs.calibre-ebook.com/ticket/6976

---
 src/calibre/ebooks/conversion/preprocess.py |  6 +++---
 src/calibre/ebooks/conversion/utils.py      | 16 +++++++++-------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 23d073cfa4..264b933047 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -356,7 +356,7 @@ class HTMLPreProcessor(object):
                   (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
 
                   # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
+                  (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
                   # Cover the case where every letter in a chapter title is separated by a space
                   (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
 
@@ -461,10 +461,10 @@ class HTMLPreProcessor(object):
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'), 'median')
             if length:
-                # print "The pdf line length returned is " + str(length)
+                print "The pdf line length returned is " + str(length)
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                 )
                 # unwrap em/en dashes
                 end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 555f42702b..f41f6abd08 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -77,6 +77,11 @@ class PreProcessor(object):
 
     def __call__(self, html):
         self.log("*********  Preprocessing HTML  *********")
+
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = re.sub(r"\s*</p>", "</p>\n", html)
+        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+        
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
@@ -135,9 +140,7 @@ class PreProcessor(object):
                #print "blanks between paragraphs is marked True"
             else:
                 blanks_between_paragraphs = False
-        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-        html = re.sub(r"\s*</p>", "</p>\n", html)
-        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")  
         # detect chapters/sections to match xpath or splitting logic
         #
         # Build the Regular Expressions in pieces
@@ -160,11 +163,10 @@ class PreProcessor(object):
         default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
         typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
         numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
-        uppercase_chapters = r"\s*.?([A-Z#\-\s]+)\s*"
+        uppercase_chapters = r"\s*.?([A-Z#\-]+\s{0,3}){1,5}\s*"
         
         chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-        #print chapter_marker
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")       
+        #print chapter_marker     
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
@@ -183,9 +185,9 @@ class PreProcessor(object):
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
             chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
             chapdetect2 = re.compile(r'%s' % chapter_marker,  re.UNICODE)
+            print str(chapter_marker)
             #chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
-
         ###### Unwrap lines ######
         #
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags

From 668ea97895c330dc54c248f646a10b91d447a2ab Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 27 Sep 2010 17:17:21 +0800
Subject: [PATCH 10/32] Excluded regex metacharacters from the first-part
 match in the rest of the dehyphenation patterns

---
 src/calibre/ebooks/conversion/preprocess.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 264b933047..840eff4c12 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -212,13 +212,13 @@ class Dehyphenator(object):
         self.html = html
         self.format = format
         if format == 'html':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
         elif format == 'html_cleanup':
-            intextmatch = re.compile(u'(?P<firstpart>[^“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
 
         html = intextmatch.sub(self.dehyphenate, html)
         return html

From 2677a9296b07acb750d48a70933ee5402e3081bc Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 27 Sep 2010 17:59:31 +0800
Subject: [PATCH 11/32] String searches - avoid regex compilations entirely

---
 src/calibre/ebooks/conversion/preprocess.py | 17 ++++++-----------
 src/calibre/ebooks/conversion/utils.py      |  3 +--
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 840eff4c12..36221f486b 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -167,8 +167,8 @@ class Dehyphenator(object):
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
         self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
+        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
+        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
 
     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
@@ -182,17 +182,13 @@ class Dehyphenator(object):
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
-        # escape any meta-characters which may be in the lookup word
-        lookupword = re.sub(r'(?P<meta>[\[\]\\\^\$\.\|\?\*\+\(\)])', r'\\\g<meta>', lookupword)
         #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
         booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
         if self.format == 'html_cleanup':
-           match = booklookup.search(self.html)
-           hyphenmatch = re.search(u'%s' % hyphenated, self.html)
-           if match:
+           if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
                return dehyphenated
-           elif hyphenmatch:
+           elif self.html.find(hyphenated) != -1:
                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
                return hyphenated
            else:
@@ -200,8 +196,7 @@ class Dehyphenator(object):
                return firsthalf+u'\u2014'+wraptags+secondhalf
                
         else:
-            match = booklookup.search(self.html)
-            if match:
+            if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
                 #print "returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
             else:
@@ -461,7 +456,7 @@ class HTMLPreProcessor(object):
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'), 'median')
             if length:
-                print "The pdf line length returned is " + str(length)
+                #print "The pdf line length returned is " + str(length)
                 end_rules.append(
                     # Un wrap using punctuation
                     (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f41f6abd08..28c92eb7d8 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -163,7 +163,7 @@ class PreProcessor(object):
         default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
         typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
         numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
-        uppercase_chapters = r"\s*.?([A-Z#\-]+\s{0,3}){1,5}\s*"
+        uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
         
         chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
         #print chapter_marker     
@@ -185,7 +185,6 @@ class PreProcessor(object):
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
             chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
             chapdetect2 = re.compile(r'%s' % chapter_marker,  re.UNICODE)
-            print str(chapter_marker)
             #chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
         ###### Unwrap lines ######

From 217a1716faf8c8394b52f3467e07f537137de217 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 27 Sep 2010 18:17:35 +0800
Subject: [PATCH 12/32] copied the fixes for 6976 over to html preprocess code

---
 src/calibre/ebooks/conversion/utils.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 28c92eb7d8..9c57756d28 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -161,7 +161,7 @@ class PreProcessor(object):
         opt_title_close = ")?"
         
         default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
-        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
+        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
         numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
         uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
         
@@ -185,7 +185,6 @@ class PreProcessor(object):
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
             chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
             chapdetect2 = re.compile(r'%s' % chapter_marker,  re.UNICODE)
-            #chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
         ###### Unwrap lines ######
         #
@@ -222,7 +221,7 @@ class PreProcessor(object):
             html = dehyphenator(html,'html', length)
             self.log("Done dehyphenating")
             # Unwrap lines using punctation and line length
-            unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+            unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
             html = unwrap.sub(' ', html)
             #check any remaining hyphens, but only unwrap if there is a match
             dehyphenator = Dehyphenator()

From 4baab972aba634db2bcc3f97e43cd49e111a8aad Mon Sep 17 00:00:00 2001
From: Li Fanxi <lifanxi@freemindworld.com>
Date: Mon, 27 Sep 2010 18:48:55 +0800
Subject: [PATCH 13/32] Add Douban.com cover plugin

---
 src/calibre/customize/builtins.py     |  4 +-
 src/calibre/ebooks/metadata/covers.py | 64 +++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index ec9f7e2bc2..5fd51de38c 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -469,14 +469,14 @@ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
     LibraryThing
 from calibre.ebooks.metadata.douban import DoubanBooks
 from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
-        LibraryThingCovers
+        LibraryThingCovers, DoubanCovers
 from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
 from calibre.ebooks.epub.fix.unmanifested import Unmanifested
 from calibre.ebooks.epub.fix.epubcheck import Epubcheck
 
 plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
         LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
-        Epubcheck, OpenLibraryCovers, LibraryThingCovers]
+        Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers]
 plugins += [
     ComicInput,
     EPUBInput,
diff --git a/src/calibre/ebooks/metadata/covers.py b/src/calibre/ebooks/metadata/covers.py
index b05444c1c6..bef19b4db7 100644
--- a/src/calibre/ebooks/metadata/covers.py
+++ b/src/calibre/ebooks/metadata/covers.py
@@ -9,6 +9,7 @@ import traceback, socket, re, sys
 from functools import partial
 from threading import Thread, Event
 from Queue import Queue, Empty
+from lxml import etree
 
 import mechanize
 
@@ -216,6 +217,69 @@ def download_covers(mi, result_queue, max_covers=50, timeout=5.): # {{{
 
 # }}}
 
+class DoubanCovers(CoverDownload): # {{{
+    'Download covers from Douban.com'
+
+    DOUBAN_ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
+    CALIBRE_DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
+    name = 'Douban.com covers'
+    description = _('Download covers from Douban.com')
+    author = 'Li Fanxi'
+
+    def get_cover_url(self, isbn, br, timeout=5.):
+        # Return the Douban cover URL for isbn, or None when there is no cover
+        try:
+            url = self.DOUBAN_ISBN_URL + isbn + "?apikey=" + self.CALIBRE_DOUBAN_API_KEY
+            src = br.open(url, timeout=timeout).read()
+        except Exception, err:
+            # args can be empty, in which case indexing it raises IndexError
+            args = getattr(err, 'args', None) or [None]
+            if isinstance(args[0], socket.timeout):
+                err = Exception(_('Douban.com API timed out. Try again later.'))
+            raise err
+        else:
+            feed = etree.fromstring(src)
+            XPath = partial(etree.XPath, namespaces={'atom': 'http://www.w3.org/2005/Atom'})
+            entries = XPath('//atom:entry')(feed)
+            if not entries:
+                return None
+            cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
+            links = cover_url(entries[0])
+            # If URL contains "book-default", the book doesn't have a cover
+            if not links or 'book-default' in links[0]:
+                return None
+            u = links[0].replace('/spic/', '/lpic/')
+            return u
+
+    def has_cover(self, mi, ans, timeout=5.):
+        print "has_cover called"
+        if not mi.isbn:
+            return False
+        br = browser()
+        # Set ans when a cover URL is found; lookup errors are only logged
+        try:
+            if self.get_cover_url(mi.isbn, br, timeout=timeout) is not None:
+                self.debug('cover for', mi.isbn, 'found')
+                ans.set()
+        except Exception, e:
+            self.debug(e)
+
+    def get_covers(self, mi, result_queue, abort, timeout=5.):
+        # Puts one (ok, data or error, format or traceback, name) tuple on result_queue
+        if not mi.isbn:
+            return
+        br = browser()
+        try:
+            url = self.get_cover_url(mi.isbn, br, timeout=timeout)
+            if url is None:
+                raise ValueError(_('Douban.com has no cover for ISBN: %s') % mi.isbn)
+            cover_data = br.open_novisit(url, timeout=timeout).read()
+            result_queue.put((True, cover_data, 'jpg', self.name))
+        except Exception, e:
+            result_queue.put((False, self.exception_to_string(e),
+                traceback.format_exc(), self.name))
+# }}}
+
 def download_cover(mi, timeout=5.): # {{{
     results = Queue()
     download_covers(mi, results, max_covers=1, timeout=timeout)

From 04187001a638427a984065dce13c8b56cc60d736 Mon Sep 17 00:00:00 2001
From: Li Fanxi <lifanxi@freemindworld.com>
Date: Mon, 27 Sep 2010 18:56:09 +0800
Subject: [PATCH 14/32] Removed an unnecessary debug print

---
 src/calibre/ebooks/metadata/covers.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/calibre/ebooks/metadata/covers.py b/src/calibre/ebooks/metadata/covers.py
index bef19b4db7..2c071dbbc9 100644
--- a/src/calibre/ebooks/metadata/covers.py
+++ b/src/calibre/ebooks/metadata/covers.py
@@ -256,7 +256,6 @@ class DoubanCovers(CoverDownload): # {{{
             return u
 
     def has_cover(self, mi, ans, timeout=5.):
-        print "has_cover called"
         if not mi.isbn:
             return False
         br = browser()

From 2f04d0b17c5348628e52922d1b7ddeb2cc5da234 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 27 Sep 2010 19:13:57 +0800
Subject: [PATCH 15/32] re-worked unsupported unicode chars Output profile
 option to use Unidecoder to do simple ascii conversion

---
 src/calibre/customize/profiles.py           | 2 +-
 src/calibre/ebooks/conversion/preprocess.py | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py
index e281179565..5fb14988a5 100644
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@@ -426,7 +426,7 @@ class SonyReaderOutput(OutputProfile):
     dpi                       = 168.451
     fbase                     = 12
     fsizes                    = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
-    unsupported_unicode_chars = [[u'\u2018',u'‘'], [u'\u2019',u'’'], [u'\u201a',u'‘'], [u'\u201b',u'’'], [u'\u201c',u'“'], [u'\u201d',u'”'], [u'\u201e',u'“'], [u'\u201f',u'”']]
+    unsupported_unicode_chars = [u'\u201f', u'\u201b']
 
 
 class KoboReaderOutput(OutputProfile):
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 36221f486b..7f384a27bd 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -520,8 +520,12 @@ class HTMLPreProcessor(object):
             html = self.smarten_punctuation(html)
 
         unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
-        for [char, replacement] in unsupported_unicode_chars:
-            html = re.sub('%s' % char, replacement, html)
+        if unsupported_unicode_chars != []:
+            from calibre.ebooks.unidecode.unidecoder import Unidecoder
+            unidecoder = Unidecoder()
+            for char in unsupported_unicode_chars:
+                asciichar = unidecoder.decode(char)
+                html = re.sub('%s' % char, asciichar, html)
 
         return html
 

From f71728110d490b289a4d32d062ff557c93a82eb3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 27 Sep 2010 14:20:03 -0600
Subject: [PATCH 16/32] Revert removal of inline toc from news downloaded in
 MOBI format

---
 src/calibre/gui2/tools.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py
index 2f0452a773..c068168247 100644
--- a/src/calibre/gui2/tools.py
+++ b/src/calibre/gui2/tools.py
@@ -217,9 +217,12 @@ def fetch_scheduled_recipe(arg):
     if 'output_profile' in ps:
         recs.append(('output_profile', ps['output_profile'],
             OptionRecommendation.HIGH))
-        if ps['output_profile'] == 'kindle':
-            recs.append(('no_inline_toc', True,
-                OptionRecommendation.HIGH))
+        # Disabled since apparently some people use
+        # K4PC and, surprise, surprise, it doesn't support
+        # indexed MOBIs.
+        #if ps['output_profile'] == 'kindle':
+        #    recs.append(('no_inline_toc', True,
+        #        OptionRecommendation.HIGH))
 
     lf = load_defaults('look_and_feel')
     if lf.get('base_font_size', 0.0) != 0.0:

From e7ddc671c6a903150a88e92e56db4c6f8ed7689a Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 27 Sep 2010 16:13:25 -0600
Subject: [PATCH 17/32] Frederik Pohl's blog by Darko Miletic

---
 resources/recipes/twtfb.recipe | 40 ++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 resources/recipes/twtfb.recipe

diff --git a/resources/recipes/twtfb.recipe b/resources/recipes/twtfb.recipe
new file mode 100644
index 0000000000..bb2bfe2348
--- /dev/null
+++ b/resources/recipes/twtfb.recipe
@@ -0,0 +1,40 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+www.thewaythefutureblogs.com
+Frederik Pohl's Blog
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TheWayTheFutureBlogs(BasicNewsRecipe):
+    # Recipe for Frederik Pohl's blog, built from the single feed in `feeds`
+    title                 = 'The Way the Future Blogs'
+    __author__            = 'Darko Miletic'
+    description           = "Frederik Pohl's blog"
+    publisher             = 'Frederik Pohl'
+    category              = 'news, SF, books'
+    language              = 'en'
+    encoding              = 'utf8'
+    oldest_article        = 30
+    max_articles_per_feed = 200
+    no_stylesheets        = True
+    use_embedded_content  = False
+    remove_empty_feeds    = True
+    extra_css             = ' body{font-family: Georgia,serif } '
+    conversion_options    = {
+        'comment'   : description,
+        'tags'      : category,
+        'publisher' : publisher,
+        'language'  : language,
+    }
+    feeds             = [(u'Posts', u'http://www.thewaythefutureblogs.com/feed/')]
+    keep_only_tags    = [dict(attrs={'class':['post','commentlist']})]
+    remove_tags       = [dict(name=['meta','object','embed','iframe','base','link'])]
+    remove_attributes = ['width','height','lang','border']
+
+    def preprocess_html(self, soup):
+        # Remove the inline style attribute from every tag that carries one
+        for tag in soup.findAll(style=True):
+            del tag['style']
+        return soup

From f5431765f4340df5fe569c215c8aef370cda5788 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 28 Sep 2010 11:50:39 +0800
Subject: [PATCH 18/32] moved line_length into DocAnalysis class, added
 line_histogram function

---
 src/calibre/ebooks/conversion/preprocess.py | 74 ++++++++++++---------
 src/calibre/ebooks/conversion/utils.py      |  9 +--
 2 files changed, 47 insertions(+), 36 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 7f384a27bd..4a2d56d957 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -61,32 +61,35 @@ def wrap_lines(match):
                return ' '
     else:
                return ital+' '
-
-def line_length(format, raw, percent, test_type):
+               
+class DocAnalysis(object):
     '''
-    Analyses the document to see if hard line breaks exist or to find the 
-    median line length.
+    Provides various text analysis functions to determine how the document is structured.
     format is the type of document analysis will be done against.
     raw is the raw text to determine the line length to use for wrapping.
-    percentage is a decimal number, 0 - 1 which is used to determine
-    how far in the list of line lengths to use. The list of line lengths is
-    ordered smallest to larged and does not include duplicates. 0.5 is the
-    median value.
-    test_type sets whether to use the line length to return the median or a
-    do a histogram analysis to see if unwrapping is required.
+    Blank lines are excluded from analysis
     '''
-    raw = raw.replace('&nbsp;', ' ')
-    if format == 'html':
-        linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
-    elif format == 'pdf':
-        linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
-    elif format == 'spanned_html':
-        linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
-    lines = linere.findall(raw)
 
-    if test_type == 'median':
+    def __init__(self, format='html', raw=''):
+        raw = raw.replace('&nbsp;', ' ')
+        if format == 'html':
+            linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
+        elif format == 'pdf':
+            linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
+        elif format == 'spanned_html':
+            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+        self.lines = linere.findall(raw)
+    
+    def line_length(self, percent):
+        '''
+        Analyses the document to find the median line length.
+        percentage is a decimal number, 0 - 1 which is used to determine
+        how far in the list of line lengths to use. The list of line lengths is
+        ordered smallest to larged and does not include duplicates. 0.5 is the
+        median value.
+        '''
         lengths = []
-        for line in lines:
+        for line in self.lines:
             if len(line) > 0:
                 lengths.append(len(line))
 
@@ -111,22 +114,28 @@ def line_length(format, raw, percent, test_type):
         index = int(len(lengths) * percent) - 1
 
         return lengths[index]
-
-    if test_type == 'histogram':
+    
+    def line_histogram(self, percent):
+        '''
+        Creates a broad histogram of the document to determine whether it incorporates hard
+        line breaks.  Lines are sorted into 20 'buckets' based on length.
+        percent is the percentage of lines that should be in a single bucket to return true
+        The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
+        '''
         minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
         maxLineLength=1900 # Discard larger than this to stay in range
         buckets=20 # Each line is divided into a bucket based on length
 
         #print "there are "+str(len(lines))+" lines"
-        max = 0
-        for line in lines:
-            l = len(line)
-            if l > max:
-                max = l
+        #max = 0
+        #for line in self.lines:
+        #    l = len(line)
+        #    if l > max:
+        #        max = l
         #print "max line found is "+str(max)
         # Build the line length histogram
         hRaw = [ 0 for i in range(0,buckets) ]
-        for line in lines:
+        for line in self.lines:
             l = len(line)
             if l > minLineLength and l < maxLineLength:
                     l = int(l/100)
@@ -134,7 +143,7 @@ def line_length(format, raw, percent, test_type):
                     hRaw[l]+=1
 
         # Normalize the histogram into percents
-        totalLines = len(lines)
+        totalLines = len(self.lines)
         h = [ float(count)/totalLines for count in hRaw ]
         #print "\nhRaw histogram lengths are: "+str(hRaw)
         #print "              percents are: "+str(h)+"\n"
@@ -454,15 +463,16 @@ class HTMLPreProcessor(object):
 
         length = -1
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
-            length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'), 'median')
+            docanalysis = DocAnalysis('pdf', html)
+            length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
             if length:
                 #print "The pdf line length returned is " + str(length)
+                # unwrap em/en dashes
+                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                 end_rules.append(
                     # Un wrap using punctuation
                     (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                 )
-                # unwrap em/en dashes
-                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
 
         for rule in self.PREPROCESS + start_rules:
             html = rule[0].sub(rule[1], html)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 9c57756d28..96df37f631 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
 import re
-from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
 
 class PreProcessor(object):
@@ -204,11 +204,12 @@ class PreProcessor(object):
             format = 'html'
         # Check Line histogram to determine if the document uses hard line breaks, If 50% or 
         # more of the lines break in the same region of the document then unwrapping is required
-        hardbreaks = line_length(format, html, .50, 'histogram')
-        #print "Hard line breaks check returned "+str(hardbreaks)
+        docanalysis = DocAnalysis(format, html)
+        hardbreaks = docanalysis.line_histogram(.50)
+        self.log("Hard line breaks check returned "+str(hardbreaks))
         # Calculate Length
         unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
-        length = line_length(format, html, unwrap_factor, 'median')
+        length = docanalysis.line_length(unwrap_factor)
         self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
         # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
         if hardbreaks or unwrap_factor < 0.4:

From b7f6d820a77c64ae15139ea80870f64922b10823 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 28 Sep 2010 16:07:43 +0800
Subject: [PATCH 19/32] Fixed a UTF-8 decoding error which occurs on bad input
 while removing hyphens, cleaned up indents, regex tweaks

---
 src/calibre/ebooks/conversion/preprocess.py | 27 ++++++++++++---------
 src/calibre/ebooks/conversion/utils.py      |  4 +--
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 4a2d56d957..960dbf0242 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -75,7 +75,7 @@ class DocAnalysis(object):
         if format == 'html':
             linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
         elif format == 'pdf':
-            linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
+            linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
         elif format == 'spanned_html':
             linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
         self.lines = linere.findall(raw)
@@ -191,18 +191,21 @@ class Dehyphenator(object):
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
-        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
-        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
+        print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        try:
+            searchresult = self.html.find(str.lower(lookupword))
+        except:
+            return hyphenated                
         if self.format == 'html_cleanup':
-           if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
-               #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
-               return dehyphenated
-           elif self.html.find(hyphenated) != -1:
-               #print "Cleanup:returned hyphenated word: " + str(hyphenated)
-               return hyphenated
-           else:
-               #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
-               return firsthalf+u'\u2014'+wraptags+secondhalf
+            if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
+                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                return dehyphenated
+            elif self.html.find(hyphenated) != -1:
+                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                return hyphenated
+            else:
+                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                return firsthalf+u'\u2014'+wraptags+secondhalf
                
         else:
             if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 96df37f631..b6969a3659 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -145,7 +145,7 @@ class PreProcessor(object):
         #
         # Build the Regular Expressions in pieces
         lookahead = "(?=<(p|div))"
-        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>span|[ibu])[^>]*>)?\s*(<(?P<inner2>span|[ibu])[^>]*>)?\s*(<(?P<inner3>span|[ibu])[^>]*>)?\s*"
+        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
         chapter_header_open = r"(?P<chap>"
         chapter_header_close = ")\s*"
         chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
@@ -154,7 +154,7 @@ class PreProcessor(object):
         else:
             blank_lines = ""
         opt_title_open = "("
-        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>span|[ibu])[^>]*>)?\s*(<(?P<inner5>span|[ibu])[^>]*>)?\s*(<(?P<inner6>span|[ibu])[^>]*>)?\s*"
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
         title_header_open = "(?P<title>"
         title_header_close = ")\s*"
         title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"

From 443d45c5605e00e85e1f1f6d040bbb0da0ccdea3 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 28 Sep 2010 16:21:34 +0800
Subject: [PATCH 20/32] ...

---
 src/calibre/ebooks/conversion/preprocess.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 960dbf0242..6e83146527 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -191,13 +191,13 @@ class Dehyphenator(object):
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
-        print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
         try:
             searchresult = self.html.find(str.lower(lookupword))
         except:
             return hyphenated                
         if self.format == 'html_cleanup':
-            if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
+            if self.html.find(lookupword) != -1 or searchresult != -1:
                 #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
             elif self.html.find(hyphenated) != -1:
@@ -208,11 +208,11 @@ class Dehyphenator(object):
                 return firsthalf+u'\u2014'+wraptags+secondhalf
                
         else:
-            if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
+            if self.html.find(lookupword) != -1 or searchresult != -1:
                 #print "returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
             else:
-                #print "returned hyphenated word: " + str(hyphenated)
+                #print "           returned hyphenated word: " + str(hyphenated)
                 return hyphenated
 
     def __call__(self, html, format, length=1):

From 7653dfd082fefc33efe8ba1cead481e04566abca Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 28 Sep 2010 17:54:07 +0800
Subject: [PATCH 21/32] switch string to unicode

---
 src/calibre/ebooks/conversion/preprocess.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 6e83146527..2e02a1b90e 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -538,7 +538,7 @@ class HTMLPreProcessor(object):
             unidecoder = Unidecoder()
             for char in unsupported_unicode_chars:
                 asciichar = unidecoder.decode(char)
-                html = re.sub('%s' % char, asciichar, html)
+                html = re.sub(u'%s' % char, asciichar, html)
 
         return html
 

From a0382a8d86c20c386b92a9b52a7a08b65279f22d Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 28 Sep 2010 09:57:49 -0600
Subject: [PATCH 22/32] Fix #6986 (Updated recipe for Telegraph UK)

---
 resources/recipes/telegraph_uk.recipe | 49 +++++++++++----------------
 1 file changed, 20 insertions(+), 29 deletions(-)

diff --git a/resources/recipes/telegraph_uk.recipe b/resources/recipes/telegraph_uk.recipe
index 2c261987b2..f79f0fa50c 100644
--- a/resources/recipes/telegraph_uk.recipe
+++ b/resources/recipes/telegraph_uk.recipe
@@ -1,6 +1,5 @@
-#!/usr/bin/env  python
 __license__   = 'GPL v3'
-__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 telegraph.co.uk
 '''
@@ -8,14 +7,16 @@ telegraph.co.uk
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class TelegraphUK(BasicNewsRecipe):
-    title                 = u'Telegraph.co.uk'
+    title                 = 'Telegraph.co.uk'
     __author__            = 'Darko Miletic and Sujata Raman'
     description           = 'News from United Kingdom'
-    oldest_article        = 7
+    oldest_article        = 2
+    category              = 'news, politics, UK'
+    publisher             = 'Telegraph Media Group ltd.'    
     max_articles_per_feed = 100
     no_stylesheets        = True
-    language = 'en'
-
+    language              = 'en_GB'
+    remove_empty_feeds    = True
     use_embedded_content  = False
 
     extra_css           = '''
@@ -27,13 +28,20 @@ class TelegraphUK(BasicNewsRecipe):
                         .imageExtras{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
                         '''
 
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+                        
+                        
     keep_only_tags      = [
-                           dict(name='div', attrs={'class':'storyHead'})
-                          ,dict(name='div', attrs={'class':'story'    })
-                          #,dict(name='div', attrs={'class':['slideshowHD gutterUnder',"twoThirds gutter","caption" ]   })
+                           dict(name='div', attrs={'class':['storyHead','byline']})
+                          ,dict(name='div', attrs={'id':'mainBodyArea'           })
                           ]
-    remove_tags         = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide']})
-                          #,dict(name='div', attrs={'class':['toolshideoneQuarter']})
+    remove_tags         = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide','related_links_video']})
+                          ,dict(name='ul' , attrs={'class':['shareThis shareBottom']})
                           ,dict(name='span', attrs={'class':['num','placeComment']})
                           ]
 
@@ -51,24 +59,7 @@ class TelegraphUK(BasicNewsRecipe):
                          ]
 
     def get_article_url(self, article):
-
-        url = article.get('guid', None)
-
+        url = article.get('link', None)
         if 'picture-galleries' in url or 'pictures' in url or 'picturegalleries' in url :
             url = None
-
         return url
-
-
-    def postprocess_html(self,soup,first):
-
-        for bylineTag in soup.findAll(name='div', attrs={'class':'byline'}):
-            for pTag in bylineTag.findAll(name='p'):
-                if getattr(pTag.contents[0],"Comments",True):
-                    pTag.extract()
-        return soup
-
-
-
-
-

From fdc171a0acc81c367eb0e626d5f3a41d8f48814b Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 28 Sep 2010 13:55:02 -0600
Subject: [PATCH 23/32] Automatically enable the Douban metadata download
 plugins if the user chooses Chinese as the interface language in the welcome
 wizard

---
 src/calibre/gui2/wizard/__init__.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py
index f3234d48d5..8460210cd0 100644
--- a/src/calibre/gui2/wizard/__init__.py
+++ b/src/calibre/gui2/wizard/__init__.py
@@ -584,6 +584,13 @@ class LibraryPage(QWizardPage, LibraryUI):
         qt_app.load_translations()
         self.emit(SIGNAL('retranslate()'))
         self.init_languages()
+        try:
+            if prefs['language'].lower().startswith('zh'):
+                from calibre.customize.ui import enable_plugin
+                for name in ('Douban Books', 'Douban.com covers'):
+                    enable_plugin(name)
+        except:
+            pass
 
     def change(self):
         dir = choose_dir(self, 'database location dialog',

From 5fe81f0162680bc3f3c1dd9bbbf8b826980ad8c6 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 28 Sep 2010 14:24:16 -0600
Subject: [PATCH 24/32] Welcome wizard: Prevent the user from choosing a
 non-empty folder as her calibre library

---
 src/calibre/gui2/wizard/__init__.py | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py
index 8460210cd0..ef58ec3a90 100644
--- a/src/calibre/gui2/wizard/__init__.py
+++ b/src/calibre/gui2/wizard/__init__.py
@@ -592,11 +592,34 @@ class LibraryPage(QWizardPage, LibraryUI):
         except:
             pass
 
+    def is_library_dir_suitable(self, x): # truthy for a library (per exists_at) or an empty folder
+        return LibraryDatabase2.exists_at(x) or (os.path.isdir(x) and not os.listdir(x))
+
+    def validatePage(self): # QWizardPage hook; returning False rejects the page
+        chosen = unicode(self.location.text())
+        if self.is_library_dir_suitable(chosen):
+            return True
+        self.show_library_dir_error(chosen)
+        return False
+
     def change(self):
-        dir = choose_dir(self, 'database location dialog',
+        x = choose_dir(self, 'database location dialog',
                          _('Select location for books'))
-        if dir:
-            self.location.setText(dir)
+        if x:
+            if self.is_library_dir_suitable(x):
+                self.location.setText(x)
+            else:
+                self.show_library_dir_error(x)
+
+    def show_library_dir_error(self, x): # x: the rejected folder, bytes or unicode
+        if not isinstance(x, unicode):
+            try:
+                x = x.decode(filesystem_encoding)
+            except Exception:
+                # Undecodable path: fall back to its repr so the dialog still shows
+                x = unicode(repr(x))
+        error_dialog(self, _('Bad location'), _('You must choose an empty folder for '
+                'the calibre library. %s is not empty.')%x, show=True)
 
     def initializePage(self):
         lp = prefs['library_path']

From 96bc9f6bec337c2d551a0171f42ca9759d715326 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 28 Sep 2010 16:55:22 -0600
Subject: [PATCH 25/32] Stop metadata backup thread before bulk metadata edits
 to improve performance

---
 src/calibre/gui2/actions/edit_metadata.py |  2 +-
 src/calibre/gui2/dialogs/metadata_bulk.py | 20 ++++++++++++++------
 src/calibre/gui2/library/models.py        | 19 +++++++++++++------
 src/calibre/gui2/preferences/misc.py      | 13 ++++++-------
 4 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/src/calibre/gui2/actions/edit_metadata.py b/src/calibre/gui2/actions/edit_metadata.py
index bd9728989b..cc74b3c515 100644
--- a/src/calibre/gui2/actions/edit_metadata.py
+++ b/src/calibre/gui2/actions/edit_metadata.py
@@ -184,7 +184,7 @@ class EditMetadataAction(InterfaceAction):
         self.gui.tags_view.blockSignals(True)
         try:
             changed = MetadataBulkDialog(self.gui, rows,
-                self.gui.library_view.model().db).changed
+                self.gui.library_view.model()).changed
         finally:
             self.gui.tags_view.blockSignals(False)
         if changed:
diff --git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py
index 9c83b3aee5..b0ce0a1e6d 100644
--- a/src/calibre/gui2/dialogs/metadata_bulk.py
+++ b/src/calibre/gui2/dialogs/metadata_bulk.py
@@ -142,12 +142,13 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
                             _('Append to field'),
                         ]
 
-    def __init__(self, window, rows, db):
+    def __init__(self, window, rows, model):
         QDialog.__init__(self, window)
         Ui_MetadataBulkDialog.__init__(self)
         self.setupUi(self)
-        self.db = db
-        self.ids = [db.id(r) for r in rows]
+        self.model = model
+        self.db = model.db
+        self.ids = [self.db.id(r) for r in rows]
         self.box_title.setText('<p>' +
                 _('Editing meta information for <b>%d books</b>') %
                 len(rows))
@@ -170,7 +171,7 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
         self.tag_editor_button.clicked.connect(self.tag_editor)
         self.autonumber_series.stateChanged[int].connect(self.auto_number_changed)
 
-        if len(db.custom_field_keys(include_composites=False)) == 0:
+        if len(self.db.custom_field_keys(include_composites=False)) == 0:
             self.central_widget.removeTab(1)
         else:
             self.create_custom_column_editors()
@@ -617,8 +618,15 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
         self.worker = Worker(args, self.db, self.ids,
                 getattr(self, 'custom_column_widgets', []),
                 Dispatcher(bb.accept, parent=bb))
-        self.worker.start()
-        bb.exec_()
+
+        # The metadata backup thread causes database commits
+        # which can slow down bulk editing of large numbers of books
+        self.model.stop_metadata_backup()
+        try:
+            self.worker.start()
+            bb.exec_()
+        finally:
+            self.model.start_metadata_backup()
 
         if self.worker.error is not None:
             return error_dialog(self, _('Failed'),
diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py
index b2a7f08055..9da5420681 100644
--- a/src/calibre/gui2/library/models.py
+++ b/src/calibre/gui2/library/models.py
@@ -159,17 +159,24 @@ class BooksModel(QAbstractTableModel): # {{{
             # do something on the GUI thread. Deadlock.
         self.cover_cache = CoverCache(db, FunctionDispatcher(self.db.cover))
         self.cover_cache.start()
-        if self.metadata_backup is not None:
-            self.metadata_backup.stop()
-            # Would like to to a join here, but the thread might be waiting to
-            # do something on the GUI thread. Deadlock.
-        self.metadata_backup = MetadataBackup(db)
-        self.metadata_backup.start()
+        self.stop_metadata_backup()
+        self.start_metadata_backup()
         def refresh_cover(event, ids):
             if event == 'cover' and self.cover_cache is not None:
                 self.cover_cache.refresh(ids)
         db.add_listener(refresh_cover)
 
+    def start_metadata_backup(self):
+        self.metadata_backup = MetadataBackup(self.db)
+        self.metadata_backup.start()
+
+    def stop_metadata_backup(self):
+        if getattr(self, 'metadata_backup', None) is not None:
+            self.metadata_backup.stop()
+            # Would like to do a join here, but the thread might be waiting to
+            # do something on the GUI thread. Deadlock.
+
+
     def refresh_ids(self, ids, current_row=-1):
         rows = self.db.refresh_ids(ids)
         if rows:
diff --git a/src/calibre/gui2/preferences/misc.py b/src/calibre/gui2/preferences/misc.py
index 865115c2ed..582d110c6c 100644
--- a/src/calibre/gui2/preferences/misc.py
+++ b/src/calibre/gui2/preferences/misc.py
@@ -106,14 +106,13 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
         d.exec_()
 
     def compact(self, *args):
-        from calibre.library.caches import MetadataBackup
         m = self.gui.library_view.model()
-        if m.metadata_backup is not None:
-            m.metadata_backup.stop()
-        d = CheckIntegrity(m.db, self)
-        d.exec_()
-        m.metadata_backup = MetadataBackup(m.db)
-        m.metadata_backup.start()
+        m.stop_metadata_backup()
+        try:
+            d = CheckIntegrity(m.db, self)
+            d.exec_()
+        finally:
+            m.start_metadata_backup()
 
     def open_config_dir(self, *args):
         from calibre.utils.config import config_dir

From fef738c53b8d5a980423d1930e6a94d4ffc8a6a8 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 28 Sep 2010 18:08:54 -0600
Subject: [PATCH 26/32] ...

---
 src/calibre/manual/faq.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index c9f6abe2c0..3cf171bc1b 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -289,7 +289,7 @@ Yes, you can. Follow the instructions in the answer above for adding custom colu
 
 How do I move my |app| library from one computer to another?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Simply copy the |app| library folder from the old to the new computer. You can find out what the library folder is by clicking the calibre icon in the toolbar. The very first item is the path to the library folder. Now on the new computer, start |app| for the first time. It will run the Welcome Wizard asking you for the location of the |app| library. Point it to the previously copied folder. If the computer you are transferring too already has a calibre installation, then the Welcome wizard wont run. In that case, click the calibre icon in the tooolbar and point it to the newly copied directory. You will now have two calibre libraries on your computer and you can switch between them by clicking the calibre icon on the toolbar.
+Simply copy the |app| library folder from the old to the new computer. You can find out what the library folder is by clicking the calibre icon in the toolbar. The very first item is the path to the library folder. Now on the new computer, start |app| for the first time. It will run the Welcome Wizard asking you for the location of the |app| library. Point it to the previously copied folder. If the computer you are transferring to already has a calibre installation, then the Welcome wizard won't run. In that case, click the calibre icon in the toolbar and point it to the newly copied directory. You will now have two calibre libraries on your computer and you can switch between them by clicking the calibre icon on the toolbar.
 
 Note that if you are transferring between different types of computers (for example Windows to OS X) then after doing the above you should also go to :guilabel:`Preferences->Advanced->Miscellaneous` and click the "Check database integrity button". It will warn you about missing files, if any, which you should then transfer by hand.
 

From ca0e8729d20c1c857254186b9f658edfdbaf0c5e Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 28 Sep 2010 18:15:31 -0600
Subject: [PATCH 27/32] Handle formatting of recursive composite templates

---
 src/calibre/library/save_to_disk.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/calibre/library/save_to_disk.py b/src/calibre/library/save_to_disk.py
index e479d27121..088b6352af 100644
--- a/src/calibre/library/save_to_disk.py
+++ b/src/calibre/library/save_to_disk.py
@@ -106,18 +106,31 @@ class SafeFormat(TemplateFormatter):
     '''
     Provides a format function that substitutes '' for any missing value
     '''
+
+    composite_values = {}
+
     def get_value(self, key, args, kwargs):
         try:
             b = self.book.get_user_metadata(key, False)
             key = key.lower()
             if b is not None and b['datatype'] == 'composite':
-                return self.vformat(b['display']['composite_template'], [], kwargs)
+                if key in self.composite_values:
+                    return self.composite_values[key]
+                self.composite_values[key] = 'RECURSIVE_COMPOSITE FIELD (S2D) ' + key
+                self.composite_values[key] = \
+                    self.vformat(b['display']['composite_template'], [], kwargs)
+                return self.composite_values[key]
             if kwargs[key]:
                 return self.sanitize(kwargs[key.lower()])
             return ''
         except:
             return ''
 
+    def safe_format(self, fmt, kwargs, error_value, book, sanitize=None):
+        self.composite_values = {}
+        return TemplateFormatter.safe_format(self, fmt, kwargs, error_value,
+                                             book, sanitize)
+
 safe_formatter = SafeFormat()
 
 def get_components(template, mi, id, timefmt='%b %Y', length=250,

From c8477338a68b55a5fb92af145a516785a96ab273 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 28 Sep 2010 18:21:52 -0600
Subject: [PATCH 28/32] Do not have the fetch news dialog close when the user
 presses Enter

---
 src/calibre/gui2/dialogs/scheduler.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/calibre/gui2/dialogs/scheduler.py b/src/calibre/gui2/dialogs/scheduler.py
index fd8184933f..30f4a2d8a2 100644
--- a/src/calibre/gui2/dialogs/scheduler.py
+++ b/src/calibre/gui2/dialogs/scheduler.py
@@ -57,6 +57,10 @@ class SchedulerDialog(QDialog, Ui_Dialog):
 
         self.old_news.setValue(gconf['oldest_news'])
 
+    def keyPressEvent(self, ev):
+        if ev.key() not in (Qt.Key_Enter, Qt.Key_Return):
+            return QDialog.keyPressEvent(self, ev)
+
     def break_cycles(self):
         self.disconnect(self.recipe_model,  SIGNAL('searched(PyQt_PyObject)'),
                 self.search_done)

From d3053b8a8612d24aec9c81dafd5567defdb37a50 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 28 Sep 2010 18:45:41 -0600
Subject: [PATCH 29/32] Support for the JetBook Mini

---
 src/calibre/customize/builtins.py     |  3 ++-
 src/calibre/devices/__init__.py       | 12 +++++++++---
 src/calibre/devices/jetbook/driver.py | 23 +++++++++++++++++++++++
 src/calibre/gui2/wizard/__init__.py   |  8 ++++++++
 4 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index ef3da9ce20..50d8e29373 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -446,7 +446,7 @@ from calibre.devices.eb600.driver import EB600, COOL_ER, SHINEBOOK, \
                 BOOQ, ELONEX, POCKETBOOK301, MENTOR
 from calibre.devices.iliad.driver import ILIAD
 from calibre.devices.irexdr.driver import IREXDR1000, IREXDR800
-from calibre.devices.jetbook.driver import JETBOOK, MIBUK
+from calibre.devices.jetbook.driver import JETBOOK, MIBUK, JETBOOK_MINI
 from calibre.devices.kindle.driver import KINDLE, KINDLE2, KINDLE_DX
 from calibre.devices.nook.driver import NOOK
 from calibre.devices.prs505.driver import PRS505
@@ -520,6 +520,7 @@ plugins += [
     IREXDR1000,
     IREXDR800,
     JETBOOK,
+    JETBOOK_MINI,
     MIBUK,
     SHINEBOOK,
     POCKETBOOK360,
diff --git a/src/calibre/devices/__init__.py b/src/calibre/devices/__init__.py
index 956d18e903..24e606e022 100644
--- a/src/calibre/devices/__init__.py
+++ b/src/calibre/devices/__init__.py
@@ -95,13 +95,19 @@ def debug(ioreg_to_tmp=False, buf=None):
             ioreg += 'Output from osx_get_usb_drives:\n'+drives+'\n\n'
             ioreg += Device.run_ioreg()
         connected_devices = []
-        for dev in sorted(device_plugins(), cmp=lambda
-                x,y:cmp(x.__class__.__name__, y.__class__.__name__)):
-            out('Looking for', dev.__class__.__name__)
+        devplugins = list(sorted(device_plugins(), cmp=lambda
+                x,y:cmp(x.__class__.__name__, y.__class__.__name__)))
+        out('Available plugins:', ' '.join([x.__class__.__name__ for x in
+            devplugins]))
+        out(' ')
+        out('Looking for devices...')
+        for dev in devplugins:
             connected, det = s.is_device_connected(dev, debug=True)
             if connected:
+                out('\t\tDetected possible device', dev.__class__.__name__)
                 connected_devices.append((dev, det))
 
+        out(' ')
         errors = {}
         success = False
         out('Devices possibly connected:', end=' ')
diff --git a/src/calibre/devices/jetbook/driver.py b/src/calibre/devices/jetbook/driver.py
index 6ee1c07464..5fd3929aaf 100644
--- a/src/calibre/devices/jetbook/driver.py
+++ b/src/calibre/devices/jetbook/driver.py
@@ -99,4 +99,27 @@ class MIBUK(USBMS):
     VENDOR_NAME      = 'LINUX'
     WINDOWS_MAIN_MEM = 'WOLDERMIBUK'
 
+class JETBOOK_MINI(USBMS):
+
+    '''
+    ['0x4b8',
+  '0x507',
+  '0x100',
+  'ECTACO',
+  'ECTACO ATA/ATAPI Bridge (Bulk-Only)',
+  'Rev.0.20']
+    '''
+    FORMATS     = ['fb2', 'txt']
+
+    name = 'JetBook Mini'
+    description    = _('Communicate with the JetBook Mini reader.')
+    author         = 'Kovid Goyal'
+
+    VENDOR_ID = [0x4b8]
+    PRODUCT_ID = [0x507]
+    BCD = [0x100]
+    VENDOR_NAME      = 'ECTACO'
+    WINDOWS_MAIN_MEM = '' # Matches PROD_
+    SUPPORTS_SUB_DIRS = True
+
 
diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py
index ef58ec3a90..37b7c7bd7c 100644
--- a/src/calibre/gui2/wizard/__init__.py
+++ b/src/calibre/gui2/wizard/__init__.py
@@ -73,6 +73,14 @@ class JetBook(Device):
     manufacturer = 'Ectaco'
     id = 'jetbook'
 
+class JetBookMini(Device):
+
+    output_profile = 'jetbook5'
+    output_format  = 'FB2'
+    name = 'JetBook Mini'
+    manufacturer = 'Ectaco'
+    id = 'jetbookmini'
+
 class KindleDX(Kindle):
 
     output_profile = 'kindle_dx'

From fce4ab97b696ef3d2addb04643980266272b4380 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 28 Sep 2010 18:50:02 -0600
Subject: [PATCH 30/32] ...

---
 src/calibre/devices/__init__.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/calibre/devices/__init__.py b/src/calibre/devices/__init__.py
index 24e606e022..1918a36cc8 100644
--- a/src/calibre/devices/__init__.py
+++ b/src/calibre/devices/__init__.py
@@ -56,6 +56,7 @@ def get_connected_device():
     return dev
 
 def debug(ioreg_to_tmp=False, buf=None):
+    import textwrap
     from calibre.customize.ui import device_plugins
     from calibre.devices.scanner import DeviceScanner, win_pnp_drives
     from calibre.constants import iswindows, isosx, __version__
@@ -97,8 +98,8 @@ def debug(ioreg_to_tmp=False, buf=None):
         connected_devices = []
         devplugins = list(sorted(device_plugins(), cmp=lambda
                 x,y:cmp(x.__class__.__name__, y.__class__.__name__)))
-        out('Available plugins:', ' '.join([x.__class__.__name__ for x in
-            devplugins]))
+        out('Available plugins:', textwrap.fill(' '.join([x.__class__.__name__ for x in
+            devplugins])))
         out(' ')
         out('Looking for devices...')
         for dev in devplugins:

From 1c9335aa5ec10a1bc2dba97bed55513c9550669f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 28 Sep 2010 19:03:54 -0600
Subject: [PATCH 31/32] Fix regression that caused the filename to not be set
 as the title when reading metadata fails

---
 src/calibre/ebooks/metadata/meta.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py
index 68deca5e10..b02ae2dbff 100644
--- a/src/calibre/ebooks/metadata/meta.py
+++ b/src/calibre/ebooks/metadata/meta.py
@@ -181,7 +181,7 @@ def metadata_from_filename(name, pat=None):
             mi.isbn = si
         except (IndexError, ValueError):
             pass
-    if not mi.title:
+    if mi.is_null('title'):
         mi.title = name
     return mi
 

From 3018b6ac7c4f7b026d5ca847734653faa1e4d0b7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 28 Sep 2010 19:07:08 -0600
Subject: [PATCH 32/32] ...

---
 src/calibre/devices/jetbook/driver.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/calibre/devices/jetbook/driver.py b/src/calibre/devices/jetbook/driver.py
index 5fd3929aaf..f108de3347 100644
--- a/src/calibre/devices/jetbook/driver.py
+++ b/src/calibre/devices/jetbook/driver.py
@@ -111,7 +111,8 @@ class JETBOOK_MINI(USBMS):
     '''
     FORMATS     = ['fb2', 'txt']
 
-    name = 'JetBook Mini'
+    gui_name = 'JetBook Mini'
+    name = 'JetBook Mini Device Interface'
     description    = _('Communicate with the JetBook Mini reader.')
     author         = 'Kovid Goyal'
 
@@ -120,6 +121,8 @@ class JETBOOK_MINI(USBMS):
     BCD = [0x100]
     VENDOR_NAME      = 'ECTACO'
     WINDOWS_MAIN_MEM = '' # Matches PROD_
+    MAIN_MEMORY_VOLUME_LABEL  = 'Jetbook Mini'
+
     SUPPORTS_SUB_DIRS = True