merge from trunk

Charles Haley 2010-09-29 07:42:23 +01:00
commit 9c3d85d4a5
21 changed files with 503 additions and 174 deletions

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
__version__ = '1.03'
__date__ = '27, September 2010'
@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en'
import datetime
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1282101454(BasicNewsRecipe):
now = datetime.datetime.now()
title = 'The AJC'
@ -20,39 +22,39 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif'
extra_css = '''
h1.articleHeadline{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2.articleSubheadline{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
p.byline{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
p.organization{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
p{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''
keep_only_tags = [
dict(name='div', attrs={'class':['cxArticleHeader']})
,dict(attrs={'id':['cxArticleText']})
]
remove_tags = [
dict(name='div' , attrs={'class':'cxArticleList' })
,dict(name='div' , attrs={'class':'cxFeedTease' })
,dict(name='div' , attrs={'class':'cxElementEnlarge' })
,dict(name='div' , attrs={'id':'cxArticleTools' })
]
feeds = [
('Breaking News', 'http://www.ajc.com/genericList-rss.do?source=61499'),
# -------------------------------------------------------------------
# Here are the different area feeds. Choose whichever one you wish to
# read by simply removing the pound sign from it. I currently have it
# set to only get the Cobb area
# --------------------------------------------------------------------
@ -70,7 +72,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
('Opinions', 'http://www.ajc.com/section-rss.do?source=opinion'),
('Ga Politics', 'http://www.ajc.com/section-rss.do?source=georgia-politics-elections'),
# ------------------------------------------------------------------------
# Here are the different sports feeds. I only follow the Falcons and high school,
# but again
# You can enable whichever team you like by removing the pound sign
# ------------------------------------------------------------------------
@ -85,25 +87,25 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
('Music', 'http://www.accessatlanta.com/section-rss.do?source=music'),
]
def postprocess_html(self, soup, first):
for credit_tag in soup.findAll('span', attrs={'class':['imageCredit rightFloat']}):
credit_tag.extract()
return soup
#def print_version(self, url):
# return url.partition('?')[0] +'?printArticle=y'

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
__version__ = '1.04'
__date__ = '27, September 2010'
@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
class AdvancedUserRecipe1282101454(BasicNewsRecipe):
title = 'Nealz Nuze'
language = 'en'
@ -18,7 +18,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
category = 'news, politics, USA, talkshow'
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
remove_javascript = True
use_embedded_content = True
@ -26,5 +26,5 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
conversion_options = {'linearize_tables' : True}
feeds = [
('NUZE', 'http://boortz.com/nealz_nuze_rss/rss.xml')
]

View File

@ -1,5 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import re
import re
class AdvancedUserRecipe1282101454(BasicNewsRecipe):
title = 'Popular Science'

View File

@ -1,6 +1,5 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
telegraph.co.uk
'''
@ -8,14 +7,16 @@ telegraph.co.uk
from calibre.web.feeds.news import BasicNewsRecipe
class TelegraphUK(BasicNewsRecipe):
title = u'Telegraph.co.uk'
title = 'Telegraph.co.uk'
__author__ = 'Darko Miletic and Sujata Raman'
description = 'News from United Kingdom'
oldest_article = 7
oldest_article = 2
category = 'news, politics, UK'
publisher = 'Telegraph Media Group ltd.'
max_articles_per_feed = 100
no_stylesheets = True
language = 'en'
language = 'en_GB'
remove_empty_feeds = True
use_embedded_content = False
extra_css = '''
@ -27,13 +28,20 @@ class TelegraphUK(BasicNewsRecipe):
.imageExtras{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
'''
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [
dict(name='div', attrs={'class':'storyHead'})
,dict(name='div', attrs={'class':'story' })
#,dict(name='div', attrs={'class':['slideshowHD gutterUnder',"twoThirds gutter","caption" ] })
dict(name='div', attrs={'class':['storyHead','byline']})
,dict(name='div', attrs={'id':'mainBodyArea' })
]
remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide']})
#,dict(name='div', attrs={'class':['toolshideoneQuarter']})
remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide','related_links_video']})
,dict(name='ul' , attrs={'class':['shareThis shareBottom']})
,dict(name='span', attrs={'class':['num','placeComment']})
]
@ -51,24 +59,7 @@ class TelegraphUK(BasicNewsRecipe):
]
def get_article_url(self, article):
url = article.get('guid', None)
url = article.get('link', None)
if 'picture-galleries' in url or 'pictures' in url or 'picturegalleries' in url :
url = None
return url
def postprocess_html(self,soup,first):
for bylineTag in soup.findAll(name='div', attrs={'class':'byline'}):
for pTag in bylineTag.findAll(name='p'):
if getattr(pTag.contents[0],"Comments",True):
pTag.extract()
return soup

View File

@ -0,0 +1,40 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.thewaythefutureblogs.com
Frederik Pohl's Blog
'''
from calibre.web.feeds.news import BasicNewsRecipe
class TheWayTheFutureBlogs(BasicNewsRecipe):
title = 'The Way the Future Blogs'
__author__ = 'Darko Miletic'
description = "Frederik Pohl's blog"
publisher = 'Frederik Pohl'
category = 'news, SF, books'
oldest_article = 30
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en'
remove_empty_feeds = True
extra_css = ' body{font-family: Georgia,serif } '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags =[dict(name=['meta','object','embed','iframe','base','link'])]
keep_only_tags=[dict(attrs={'class':['post','commentlist']})]
remove_attributes=['width','height','lang','border']
feeds = [(u'Posts', u'http://www.thewaythefutureblogs.com/feed/')]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup
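For reference, a new recipe file like this one is usually exercised from the command line before it ships; a minimal test build (the .recipe filename here is hypothetical) would be:

    ebook-convert thewaythefutureblogs.recipe .epub --test -vv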

View File

@ -446,7 +446,7 @@ from calibre.devices.eb600.driver import EB600, COOL_ER, SHINEBOOK, \
BOOQ, ELONEX, POCKETBOOK301, MENTOR
from calibre.devices.iliad.driver import ILIAD
from calibre.devices.irexdr.driver import IREXDR1000, IREXDR800
from calibre.devices.jetbook.driver import JETBOOK, MIBUK
from calibre.devices.jetbook.driver import JETBOOK, MIBUK, JETBOOK_MINI
from calibre.devices.kindle.driver import KINDLE, KINDLE2, KINDLE_DX
from calibre.devices.nook.driver import NOOK
from calibre.devices.prs505.driver import PRS505
@ -468,14 +468,14 @@ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
LibraryThing
from calibre.ebooks.metadata.douban import DoubanBooks
from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
LibraryThingCovers
LibraryThingCovers, DoubanCovers
from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
from calibre.ebooks.epub.fix.unmanifested import Unmanifested
from calibre.ebooks.epub.fix.epubcheck import Epubcheck
plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
Epubcheck, OpenLibraryCovers, LibraryThingCovers]
Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers]
plugins += [
ComicInput,
EPUBInput,
@ -520,6 +520,7 @@ plugins += [
IREXDR1000,
IREXDR800,
JETBOOK,
JETBOOK_MINI,
MIBUK,
SHINEBOOK,
POCKETBOOK360,

View File

@ -1,3 +1,4 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
@ -251,6 +252,9 @@ class OutputProfile(Plugin):
#: The character used to represent a star in ratings
ratings_char = u'*'
#: Unsupported unicode characters to be replaced during preprocessing
unsupported_unicode_chars = []
@classmethod
def tags_to_string(cls, tags):
return escape(', '.join(tags))
@ -422,6 +426,8 @@ class SonyReaderOutput(OutputProfile):
dpi = 168.451
fbase = 12
fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
unsupported_unicode_chars = [u'\u201f', u'\u201b']
class KoboReaderOutput(OutputProfile):
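The unsupported_unicode_chars list added to SonyReaderOutput above feeds the replacement loop added to HTMLPreProcessor later in this commit; a condensed sketch of that downstream logic, with profile and html standing in for the real objects:

    from calibre.ebooks.unidecode.unidecoder import Unidecoder

    unidecoder = Unidecoder()
    for char in profile.unsupported_unicode_chars:
        asciichar = unidecoder.decode(char)   # nearest ASCII equivalent
        html = html.replace(char, asciichar)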

View File

@ -120,7 +120,7 @@ def enable_plugin(plugin_or_name):
config['enabled_plugins'] = ep
default_disabled_plugins = set([
'Douban Books',
'Douban Books', 'Douban.com covers',
])
def is_disabled(plugin):

View File

@ -56,6 +56,7 @@ def get_connected_device():
return dev
def debug(ioreg_to_tmp=False, buf=None):
import textwrap
from calibre.customize.ui import device_plugins
from calibre.devices.scanner import DeviceScanner, win_pnp_drives
from calibre.constants import iswindows, isosx, __version__
@ -95,13 +96,19 @@ def debug(ioreg_to_tmp=False, buf=None):
ioreg += 'Output from osx_get_usb_drives:\n'+drives+'\n\n'
ioreg += Device.run_ioreg()
connected_devices = []
for dev in sorted(device_plugins(), cmp=lambda
x,y:cmp(x.__class__.__name__, y.__class__.__name__)):
out('Looking for', dev.__class__.__name__)
devplugins = list(sorted(device_plugins(), cmp=lambda
x,y:cmp(x.__class__.__name__, y.__class__.__name__)))
out('Available plugins:', textwrap.fill(' '.join([x.__class__.__name__ for x in
devplugins])))
out(' ')
out('Looking for devices...')
for dev in devplugins:
connected, det = s.is_device_connected(dev, debug=True)
if connected:
out('\t\tDetected possible device', dev.__class__.__name__)
connected_devices.append((dev, det))
out(' ')
errors = {}
success = False
out('Devices possibly connected:', end=' ')

View File

@ -99,4 +99,30 @@ class MIBUK(USBMS):
VENDOR_NAME = 'LINUX'
WINDOWS_MAIN_MEM = 'WOLDERMIBUK'
class JETBOOK_MINI(USBMS):
'''
['0x4b8',
'0x507',
'0x100',
'ECTACO',
'ECTACO ATA/ATAPI Bridge (Bulk-Only)',
'Rev.0.20']
'''
FORMATS = ['fb2', 'txt']
gui_name = 'JetBook Mini'
name = 'JetBook Mini Device Interface'
description = _('Communicate with the JetBook Mini reader.')
author = 'Kovid Goyal'
VENDOR_ID = [0x4b8]
PRODUCT_ID = [0x507]
BCD = [0x100]
VENDOR_NAME = 'ECTACO'
WINDOWS_MAIN_MEM = '' # Matches PROD_
MAIN_MEMORY_VOLUME_LABEL = 'Jetbook Mini'
SUPPORTS_SUB_DIRS = True

View File

@ -62,49 +62,104 @@ def wrap_lines(match):
else:
return ital+' '
def line_length(format, raw, percent):
class DocAnalysis(object):
'''
raw is the raw text to find the line length to use for wrapping.
percentage is a decimal number, 0 - 1 which is used to determine
how far in the list of line lengths to use. The list of line lengths is
ordered smallest to largest and does not include duplicates. 0.5 is the
median value.
Provides various text analysis functions to determine how the document is structured.
format is the type of document that analysis will be done against.
raw is the raw text to determine the line length to use for wrapping.
Blank lines are excluded from analysis
'''
raw = raw.replace('&nbsp;', ' ')
if format == 'html':
linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
elif format == 'pdf':
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html':
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
lines = linere.findall(raw)
lengths = []
for line in lines:
if len(line) > 0:
lengths.append(len(line))
def __init__(self, format='html', raw=''):
raw = raw.replace('&nbsp;', ' ')
if format == 'html':
linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
elif format == 'pdf':
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html':
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
self.lines = linere.findall(raw)
if not lengths:
return 0
def line_length(self, percent):
'''
Analyses the document to find the median line length.
percentage is a decimal number, 0 - 1 which is used to determine
how far in the list of line lengths to use. The list of line lengths is
ordered smallest to largest and does not include duplicates. 0.5 is the
median value.
'''
lengths = []
for line in self.lines:
if len(line) > 0:
lengths.append(len(line))
lengths = list(set(lengths))
total = sum(lengths)
avg = total / len(lengths)
max_line = avg * 2
if not lengths:
return 0
lengths = sorted(lengths)
for i in range(len(lengths) - 1, -1, -1):
if lengths[i] > max_line:
del lengths[i]
lengths = list(set(lengths))
total = sum(lengths)
avg = total / len(lengths)
max_line = avg * 2
if percent > 1:
percent = 1
if percent < 0:
percent = 0
lengths = sorted(lengths)
for i in range(len(lengths) - 1, -1, -1):
if lengths[i] > max_line:
del lengths[i]
index = int(len(lengths) * percent) - 1
if percent > 1:
percent = 1
if percent < 0:
percent = 0
return lengths[index]
index = int(len(lengths) * percent) - 1
return lengths[index]
def line_histogram(self, percent):
'''
Creates a broad histogram of the document to determine whether it incorporates hard
line breaks. Lines are sorted into 20 'buckets' based on length.
percent is the percentage of lines that should be in a single bucket to return true
The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
'''
minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
maxLineLength=1900 # Discard larger than this to stay in range
buckets=20 # Each line is divided into a bucket based on length
#print "there are "+str(len(lines))+" lines"
#max = 0
#for line in self.lines:
# l = len(line)
# if l > max:
# max = l
#print "max line found is "+str(max)
# Build the line length histogram
hRaw = [ 0 for i in range(0,buckets) ]
for line in self.lines:
l = len(line)
if l > minLineLength and l < maxLineLength:
l = int(l/100)
#print "adding "+str(l)
hRaw[l]+=1
# Normalize the histogram into percents
totalLines = len(self.lines)
h = [ float(count)/totalLines for count in hRaw ]
#print "\nhRaw histogram lengths are: "+str(hRaw)
#print " percents are: "+str(h)+"\n"
# Find the biggest bucket
maxValue = 0
for i in range(0,len(h)):
if h[i] > maxValue:
maxValue = h[i]
if maxValue < percent:
#print "Line lengths are too variable. Not unwrapping."
return False
else:
#print str(maxValue)+" of the lines were in one bucket"
return True
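A minimal usage sketch of the new DocAnalysis class (sample values are illustrative, not from the commit). Constructing it once lets the median line length and the hard-line-break check share the same parsed line list; for example, if the unique line lengths come out as [31, 45, 52, 60], then line_length(0.5) uses index int(4 * 0.5) - 1 = 1 and returns 45.

    from calibre.ebooks.conversion.preprocess import DocAnalysis

    docanalysis = DocAnalysis('pdf', html)         # html: raw pdftohtml output
    length = docanalysis.line_length(0.5)          # length at the 50% point of unique line lengths
    hardbreaks = docanalysis.line_histogram(0.50)  # True if >= 50% of lines land in one length bucket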
class Dehyphenator(object):
'''
@ -117,42 +172,62 @@ class Dehyphenator(object):
def __init__(self):
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
def dehyphenate(self, match):
firsthalf = match.group('firstpart')
secondhalf = match.group('secondpart')
try:
wraptags = match.group('wraptags')
except:
wraptags = ''
hyphenated = str(firsthalf) + "-" + str(secondhalf)
dehyphenated = str(firsthalf) + str(secondhalf)
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
match = booklookup.search(self.html)
if match:
#print "returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
else:
#print "returned hyphenated word: " + str(hyphenated)
try:
searchresult = self.html.find(str.lower(lookupword))
except:
return hyphenated
if self.format == 'html_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1:
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
elif self.html.find(hyphenated) != -1:
#print "Cleanup:returned hyphenated word: " + str(hyphenated)
return hyphenated
else:
#print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
return firsthalf+u'\u2014'+wraptags+secondhalf
else:
if self.html.find(lookupword) != -1 or searchresult != -1:
#print "returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
else:
#print " returned hyphenated word: " + str(hyphenated)
return hyphenated
def __call__(self, html, format, length=1):
self.html = html
self.format = format
if format == 'html':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(<p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
elif format == 'individual_words':
intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^"\s>]+)-(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
elif format == 'html_cleanup':
intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
html = intextmatch.sub(self.dehyphenate, html)
return html
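For reference, the reworked Dehyphenator is invoked as a callable from the HTML preprocessor elsewhere in this commit; a sketch of its two modes, where length is the wrap length computed by DocAnalysis:

    dehyphenator = Dehyphenator()
    html = dehyphenator(html, 'html', length)          # aggressive pass while unwrapping lines
    html = dehyphenator(html, 'html_cleanup', length)  # cautious pass to fix leftovers from earlier conversions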
class CSSPreProcessor(object):
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
@ -286,7 +361,7 @@ class HTMLPreProcessor(object):
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
# Detect Chapters to match default XPATH in GUI
(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
# Cover the case where every letter in a chapter title is separated by a space
(re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
@ -374,10 +449,8 @@ class HTMLPreProcessor(object):
print 'Failed to parse remove_footer regexp'
traceback.print_exc()
# unwrap em/en dashes, delete soft hyphens - moved here so it's executed after header/footer removal
# delete soft hyphens - moved here so it's executed after header/footer removal
if is_pdftohtml:
# unwrap em/en dashes
end_rules.append((re.compile(u'(?<=[–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens
end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
@ -391,12 +464,15 @@ class HTMLPreProcessor(object):
length = -1
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
docanalysis = DocAnalysis('pdf', html)
length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
if length:
# print "The pdf line length returned is " + str(length)
#print "The pdf line length returned is " + str(length)
# unwrap em/en dashes
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Unwrap using punctuation
(re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
(re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
@ -454,6 +530,14 @@ class HTMLPreProcessor(object):
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = self.smarten_punctuation(html)
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
if unsupported_unicode_chars:
from calibre.ebooks.unidecode.unidecoder import Unidecoder
unidecoder = Unidecoder()
for char in unsupported_unicode_chars:
asciichar = unidecoder.decode(char)
html = html.replace(char, asciichar)
return html
def smarten_punctuation(self, html):

View File

@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
class PreProcessor(object):
@ -77,13 +77,18 @@ class PreProcessor(object):
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = re.sub(r"\s*</p>", "</p>\n", html)
html = re.sub(r"\s*<p>\s*", "\n<p>", html)
###### Check Markup ######
#
# some lit files don't have any <p> tags or equivalent (generally just plain text between
# <pre> tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
# check if content is in pre tags, use txt procesor to mark up if so
# check if content is in pre tags, use txt processor to mark up if so
pre = re.compile(r'<pre>', re.IGNORECASE)
if len(pre.findall(html)) == 1:
self.log("Running Text Processing")
@ -113,47 +118,77 @@ class PreProcessor(object):
# Get rid of empty <o:p> tags to simplify other processing
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
# Get rid of empty span, bold, & italics tags
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
# If more than 40% of the lines are empty paragraphs and the user has enabled remove
# paragraph spacing then delete blank lines to clean up spacing
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
#multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
blanklines = blankreg.findall(html)
lines = linereg.findall(html)
blanks_between_paragraphs = False
if len(lines) > 1:
self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
'remove_paragraph_spacing', False):
self.log("deleting blank lines")
html = blankreg.sub('', html)
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = re.sub(r"\s*</p>", "</p>\n", html)
html = re.sub(r"\s*<p>\s*", "\n<p>", html)
elif float(len(blanklines)) / float(len(lines)) > 0.40:
blanks_between_paragraphs = True
#print "blanks between paragraphs is marked True"
else:
blanks_between_paragraphs = False
#self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
# detect chapters/sections to match xpath or splitting logic
#
# Build the Regular Expressions in pieces
lookahead = "(?=<(p|div))"
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
chapter_header_open = r"(?P<chap>"
chapter_header_close = ")\s*"
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
if blanks_between_paragraphs:
blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
else:
blank_lines = ""
opt_title_open = "("
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
title_header_open = "(?P<title>"
title_header_close = ")\s*"
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
opt_title_close = ")?"
default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
#print chapter_marker
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
#
# Start with most typical chapter headings, get more aggressive until one works
if self.html_preprocess_sections < 10:
chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
html = chapdetect.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
html = chapdetect2.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
###### Unwrap lines ######
#
self.log("Unwrapping Lines")
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
# that lines can be un-wrapped across page boundaries
@ -168,25 +203,40 @@ class PreProcessor(object):
format = 'html'
else:
format = 'html'
# Check line histogram to determine if the document uses hard line breaks. If 50% or
# more of the lines break in the same region of the document then unwrapping is required
docanalysis = DocAnalysis(format, html)
hardbreaks = docanalysis.line_histogram(.50)
self.log("Hard line breaks check returned "+str(hardbreaks))
# Calculate Length
length = line_length(format, html, getattr(self.extra_opts,
'html_unwrap_factor', 0.4))
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
length = docanalysis.line_length(unwrap_factor)
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
max_length = length * 1.4
min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})")
#
# Unwrap em/en dashes, delete soft-hyphens
#self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html)
# Dehyphenate
dehyphenator = Dehyphenator()
html = dehyphenator(html,'html', length)
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
if hardbreaks or unwrap_factor < 0.4:
self.log("Unwrapping required, unwrapping Lines")
# Unwrap em/en dashes
html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
# Dehyphenate
self.log("Unwrapping/Removing hyphens")
dehyphenator = Dehyphenator()
html = dehyphenator(html,'html', length)
self.log("Done dehyphenating")
# Unwrap lines using punctuation and line length
unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
html = unwrap.sub(' ', html)
#check any remaining hyphens, but only unwrap if there is a match
dehyphenator = Dehyphenator()
html = dehyphenator(html,'html_cleanup', length)
else:
# dehyphenate in cleanup mode to fix anything previous conversions/editing missed
self.log("Cleaning up hyphenation")
dehyphenator = Dehyphenator()
html = dehyphenator(html,'html_cleanup', length)
self.log("Done dehyphenating")
# Unwrap lines using punctuation and line length
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
html = unwrap.sub(' ', html)
# delete soft hyphens
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
# If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < 10:

View File

@ -9,6 +9,7 @@ import traceback, socket, re, sys
from functools import partial
from threading import Thread, Event
from Queue import Queue, Empty
from lxml import etree
import mechanize
@ -216,6 +217,68 @@ def download_covers(mi, result_queue, max_covers=50, timeout=5.): # {{{
# }}}
class DoubanCovers(CoverDownload): # {{{
'Download covers from Douban.com'
DOUBAN_ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
CALIBRE_DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
name = 'Douban.com covers'
description = _('Download covers from Douban.com')
author = 'Li Fanxi'
def get_cover_url(self, isbn, br, timeout=5.):
try:
url = self.DOUBAN_ISBN_URL + isbn + "?apikey=" + self.CALIBRE_DOUBAN_API_KEY
src = br.open(url, timeout=timeout).read()
except Exception, err:
if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
err = Exception(_('Douban.com API timed out. Try again later.'))
raise err
else:
feed = etree.fromstring(src)
NAMESPACES = {
'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
'atom' : 'http://www.w3.org/2005/Atom',
'db': 'http://www.douban.com/xmlns/'
}
XPath = partial(etree.XPath, namespaces=NAMESPACES)
entries = XPath('//atom:entry')(feed)
if len(entries) < 1:
return None
try:
cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
u = cover_url(entries[0])[0].replace('/spic/', '/lpic/');
# If URL contains "book-default", the book doesn't have a cover
if u.find('book-default') != -1:
return None
except:
return None
return u
def has_cover(self, mi, ans, timeout=5.):
if not mi.isbn:
return False
br = browser()
try:
if self.get_cover_url(mi.isbn, br, timeout=timeout) != None:
self.debug('cover for', mi.isbn, 'found')
ans.set()
except Exception, e:
self.debug(e)
def get_covers(self, mi, result_queue, abort, timeout=5.):
if not mi.isbn:
return
br = browser()
try:
url = self.get_cover_url(mi.isbn, br, timeout=timeout)
cover_data = br.open_novisit(url).read()
result_queue.put((True, cover_data, 'jpg', self.name))
except Exception, e:
result_queue.put((False, self.exception_to_string(e),
traceback.format_exc(), self.name))
# }}}
def download_cover(mi, timeout=5.): # {{{
results = Queue()
download_covers(mi, results, max_covers=1, timeout=timeout)
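The new DoubanCovers downloader is disabled by default (see the default_disabled_plugins change earlier in this commit) and is only auto-enabled for Chinese locales by the wizard hunk further down; enabling it by hand is a one-liner:

    from calibre.customize.ui import enable_plugin
    enable_plugin('Douban.com covers')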

View File

@ -181,7 +181,7 @@ def metadata_from_filename(name, pat=None):
mi.isbn = si
except (IndexError, ValueError):
pass
if not mi.title:
if mi.is_null('title'):
mi.title = name
return mi

View File

@ -184,7 +184,7 @@ class EditMetadataAction(InterfaceAction):
self.gui.tags_view.blockSignals(True)
try:
changed = MetadataBulkDialog(self.gui, rows,
self.gui.library_view.model().db).changed
self.gui.library_view.model()).changed
finally:
self.gui.tags_view.blockSignals(False)
if changed:

View File

@ -142,12 +142,13 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
_('Append to field'),
]
def __init__(self, window, rows, db):
def __init__(self, window, rows, model):
QDialog.__init__(self, window)
Ui_MetadataBulkDialog.__init__(self)
self.setupUi(self)
self.db = db
self.ids = [db.id(r) for r in rows]
self.model = model
self.db = model.db
self.ids = [self.db.id(r) for r in rows]
self.box_title.setText('<p>' +
_('Editing meta information for <b>%d books</b>') %
len(rows))
@ -170,7 +171,7 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
self.tag_editor_button.clicked.connect(self.tag_editor)
self.autonumber_series.stateChanged[int].connect(self.auto_number_changed)
if len(db.custom_field_keys(include_composites=False)) == 0:
if len(self.db.custom_field_keys(include_composites=False)) == 0:
self.central_widget.removeTab(1)
else:
self.create_custom_column_editors()
@ -617,8 +618,15 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
self.worker = Worker(args, self.db, self.ids,
getattr(self, 'custom_column_widgets', []),
Dispatcher(bb.accept, parent=bb))
self.worker.start()
bb.exec_()
# The metadata backup thread causes database commits
# which can slow down bulk editing of large numbers of books
self.model.stop_metadata_backup()
try:
self.worker.start()
bb.exec_()
finally:
self.model.start_metadata_backup()
if self.worker.error is not None:
return error_dialog(self, _('Failed'),

View File

@ -57,6 +57,10 @@ class SchedulerDialog(QDialog, Ui_Dialog):
self.old_news.setValue(gconf['oldest_news'])
def keyPressEvent(self, ev):
if ev.key() not in (Qt.Key_Enter, Qt.Key_Return):
return QDialog.keyPressEvent(self, ev)
def break_cycles(self):
self.disconnect(self.recipe_model, SIGNAL('searched(PyQt_PyObject)'),
self.search_done)

View File

@ -159,17 +159,24 @@ class BooksModel(QAbstractTableModel): # {{{
# do something on the GUI thread. Deadlock.
self.cover_cache = CoverCache(db, FunctionDispatcher(self.db.cover))
self.cover_cache.start()
if self.metadata_backup is not None:
self.metadata_backup.stop()
# Would like to do a join here, but the thread might be waiting to
# do something on the GUI thread. Deadlock.
self.metadata_backup = MetadataBackup(db)
self.metadata_backup.start()
self.stop_metadata_backup()
self.start_metadata_backup()
def refresh_cover(event, ids):
if event == 'cover' and self.cover_cache is not None:
self.cover_cache.refresh(ids)
db.add_listener(refresh_cover)
def start_metadata_backup(self):
self.metadata_backup = MetadataBackup(self.db)
self.metadata_backup.start()
def stop_metadata_backup(self):
if getattr(self, 'metadata_backup', None) is not None:
self.metadata_backup.stop()
# Would like to do a join here, but the thread might be waiting to
# do something on the GUI thread. Deadlock.
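These two helpers give callers one pattern for pausing the backup thread around long bulk operations, which both the bulk-metadata-edit and check-integrity hunks in this commit now follow; schematically (do_bulk_work is a placeholder):

    model = gui.library_view.model()
    model.stop_metadata_backup()
    try:
        do_bulk_work()   # e.g. run the bulk-edit worker or the integrity check dialog
    finally:
        model.start_metadata_backup()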
def refresh_ids(self, ids, current_row=-1):
rows = self.db.refresh_ids(ids)
if rows:

View File

@ -106,14 +106,13 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
d.exec_()
def compact(self, *args):
from calibre.library.caches import MetadataBackup
m = self.gui.library_view.model()
if m.metadata_backup is not None:
m.metadata_backup.stop()
d = CheckIntegrity(m.db, self)
d.exec_()
m.metadata_backup = MetadataBackup(m.db)
m.metadata_backup.start()
m.stop_metadata_backup()
try:
d = CheckIntegrity(m.db, self)
d.exec_()
finally:
m.start_metadata_backup()
def open_config_dir(self, *args):
from calibre.utils.config import config_dir

View File

@ -217,9 +217,12 @@ def fetch_scheduled_recipe(arg):
if 'output_profile' in ps:
recs.append(('output_profile', ps['output_profile'],
OptionRecommendation.HIGH))
if ps['output_profile'] == 'kindle':
recs.append(('no_inline_toc', True,
OptionRecommendation.HIGH))
# Disabled since apparently some people use
# K4PC and, surprise, surprise, it doesn't support
# indexed MOBIs.
#if ps['output_profile'] == 'kindle':
# recs.append(('no_inline_toc', True,
# OptionRecommendation.HIGH))
lf = load_defaults('look_and_feel')
if lf.get('base_font_size', 0.0) != 0.0:

View File

@ -73,6 +73,14 @@ class JetBook(Device):
manufacturer = 'Ectaco'
id = 'jetbook'
class JetBookMini(Device):
output_profile = 'jetbook5'
output_format = 'FB2'
name = 'JetBook Mini'
manufacturer = 'Ectaco'
id = 'jetbookmini'
class KindleDX(Kindle):
output_profile = 'kindle_dx'
@ -584,12 +592,42 @@ class LibraryPage(QWizardPage, LibraryUI):
qt_app.load_translations()
self.emit(SIGNAL('retranslate()'))
self.init_languages()
try:
if prefs['language'].lower().startswith('zh'):
from calibre.customize.ui import enable_plugin
for name in ('Douban Books', 'Douban.com covers'):
enable_plugin(name)
except:
pass
def is_library_dir_suitable(self, x):
return LibraryDatabase2.exists_at(x) or not os.listdir(x)
def validatePage(self):
newloc = unicode(self.location.text())
if not self.is_library_dir_suitable(newloc):
self.show_library_dir_error(newloc)
return False
return True
def change(self):
dir = choose_dir(self, 'database location dialog',
x = choose_dir(self, 'database location dialog',
_('Select location for books'))
if dir:
self.location.setText(dir)
if x:
if self.is_library_dir_suitable(x):
self.location.setText(x)
else:
self.show_library_dir_error(x)
def show_library_dir_error(self, x):
if not isinstance(x, unicode):
try:
x = x.decode(filesystem_encoding)
except:
x = unicode(repr(x))
error_dialog(self, _('Bad location'),
_('You must choose an empty folder for '
'the calibre library. %s is not empty.')%x, show=True)
def initializePage(self):
lp = prefs['library_path']