commit 9c3d85d4a5: merge from trunk
@@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en'
 
 
 import datetime
+from calibre.web.feeds.news import BasicNewsRecipe
+
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     now = datetime.datetime.now()
     title = 'The AJC'
@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
 
 
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
+
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Nealz Nuze'
     language = 'en'
@@ -1,5 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import re
+import re
 
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Popular Science'
@@ -1,6 +1,5 @@
-#!/usr/bin/env python
 __license__ = 'GPL v3'
-__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 telegraph.co.uk
 '''
@@ -8,14 +7,16 @@ telegraph.co.uk
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class TelegraphUK(BasicNewsRecipe):
-    title = u'Telegraph.co.uk'
+    title = 'Telegraph.co.uk'
     __author__ = 'Darko Miletic and Sujata Raman'
     description = 'News from United Kingdom'
-    oldest_article = 7
+    oldest_article = 2
+    category = 'news, politics, UK'
+    publisher = 'Telegraph Media Group ltd.'
     max_articles_per_feed = 100
     no_stylesheets = True
-    language = 'en'
+    language = 'en_GB'
+    remove_empty_feeds = True
     use_embedded_content = False
 
     extra_css = '''
@@ -27,13 +28,20 @@ class TelegraphUK(BasicNewsRecipe):
                 .imageExtras{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
                 '''
 
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
     keep_only_tags = [
-                       dict(name='div', attrs={'class':'storyHead'})
-                      ,dict(name='div', attrs={'class':'story' })
-                      #,dict(name='div', attrs={'class':['slideshowHD gutterUnder',"twoThirds gutter","caption" ] })
+                       dict(name='div', attrs={'class':['storyHead','byline']})
+                      ,dict(name='div', attrs={'id':'mainBodyArea' })
                      ]
-    remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide']})
-                  #,dict(name='div', attrs={'class':['toolshideoneQuarter']})
+    remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide','related_links_video']})
+                  ,dict(name='ul' , attrs={'class':['shareThis shareBottom']})
                   ,dict(name='span', attrs={'class':['num','placeComment']})
                   ]
 
@@ -51,24 +59,7 @@ class TelegraphUK(BasicNewsRecipe):
              ]
 
     def get_article_url(self, article):
-        url = article.get('guid', None)
+        url = article.get('link', None)
 
         if 'picture-galleries' in url or 'pictures' in url or 'picturegalleries' in url :
             url = None
 
         return url
-
-    def postprocess_html(self,soup,first):
-
-        for bylineTag in soup.findAll(name='div', attrs={'class':'byline'}):
-            for pTag in bylineTag.findAll(name='p'):
-                if getattr(pTag.contents[0],"Comments",True):
-                    pTag.extract()
-        return soup
-
-
resources/recipes/twtfb.recipe (new file, 40 lines)
@@ -0,0 +1,40 @@
+__license__ = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+www.thewaythefutureblogs.com
+Frederik Pohl's Blog
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TheWayTheFutureBlogs(BasicNewsRecipe):
+    title = 'The Way the Future Blogs'
+    __author__ = 'Darko Miletic'
+    description = "Frederik Pohl's blog"
+    publisher = 'Frederik Pohl'
+    category = 'news, SF, books'
+    oldest_article = 30
+    max_articles_per_feed = 200
+    no_stylesheets = True
+    encoding = 'utf8'
+    use_embedded_content = False
+    language = 'en'
+    remove_empty_feeds = True
+    extra_css = ' body{font-family: Georgia,serif } '
+
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
+    remove_tags =[dict(name=['meta','object','embed','iframe','base','link'])]
+    keep_only_tags=[dict(attrs={'class':['post','commentlist']})]
+    remove_attributes=['width','height','lang','border']
+
+    feeds = [(u'Posts', u'http://www.thewaythefutureblogs.com/feed/')]
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup
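
Note: the new recipe is self-contained; outside the GUI it can be exercised with calibre's ebook-convert tool, which accepts a .recipe file directly. A typical invocation from the recipe-development workflow of the era (the output file name here is illustrative):

    ebook-convert twtfb.recipe twtfb.epub --test -vv

--test fetches only a couple of articles per feed, so the preprocess_html hook above (which strips inline style attributes) can be checked quickly.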
@@ -446,7 +446,7 @@ from calibre.devices.eb600.driver import EB600, COOL_ER, SHINEBOOK, \
         BOOQ, ELONEX, POCKETBOOK301, MENTOR
 from calibre.devices.iliad.driver import ILIAD
 from calibre.devices.irexdr.driver import IREXDR1000, IREXDR800
-from calibre.devices.jetbook.driver import JETBOOK, MIBUK
+from calibre.devices.jetbook.driver import JETBOOK, MIBUK, JETBOOK_MINI
 from calibre.devices.kindle.driver import KINDLE, KINDLE2, KINDLE_DX
 from calibre.devices.nook.driver import NOOK
 from calibre.devices.prs505.driver import PRS505
@@ -468,14 +468,14 @@ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
         LibraryThing
 from calibre.ebooks.metadata.douban import DoubanBooks
 from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
-        LibraryThingCovers
+        LibraryThingCovers, DoubanCovers
 from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
 from calibre.ebooks.epub.fix.unmanifested import Unmanifested
 from calibre.ebooks.epub.fix.epubcheck import Epubcheck
 
 plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
         LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
-        Epubcheck, OpenLibraryCovers, LibraryThingCovers]
+        Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers]
 plugins += [
     ComicInput,
     EPUBInput,
@@ -520,6 +520,7 @@ plugins += [
     IREXDR1000,
     IREXDR800,
     JETBOOK,
+    JETBOOK_MINI,
     MIBUK,
     SHINEBOOK,
     POCKETBOOK360,
@@ -1,3 +1,4 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
@@ -251,6 +252,9 @@ class OutputProfile(Plugin):
     #: The character used to represent a star in ratings
     ratings_char = u'*'
 
+    #: Unsupported unicode characters to be replaced during preprocessing
+    unsupported_unicode_chars = []
+
     @classmethod
     def tags_to_string(cls, tags):
         return escape(', '.join(tags))
@@ -422,6 +426,8 @@ class SonyReaderOutput(OutputProfile):
     dpi = 168.451
     fbase = 12
     fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
+    unsupported_unicode_chars = [u'\u201f', u'\u201b']
+
 
 class KoboReaderOutput(OutputProfile):
 
@@ -120,7 +120,7 @@ def enable_plugin(plugin_or_name):
     config['enabled_plugins'] = ep
 
 default_disabled_plugins = set([
-        'Douban Books',
+        'Douban Books', 'Douban.com covers',
])
 
 def is_disabled(plugin):
@@ -56,6 +56,7 @@ def get_connected_device():
     return dev
 
 def debug(ioreg_to_tmp=False, buf=None):
+    import textwrap
     from calibre.customize.ui import device_plugins
     from calibre.devices.scanner import DeviceScanner, win_pnp_drives
     from calibre.constants import iswindows, isosx, __version__
@@ -95,13 +96,19 @@ def debug(ioreg_to_tmp=False, buf=None):
             ioreg += 'Output from osx_get_usb_drives:\n'+drives+'\n\n'
             ioreg += Device.run_ioreg()
         connected_devices = []
-        for dev in sorted(device_plugins(), cmp=lambda
-                x,y:cmp(x.__class__.__name__, y.__class__.__name__)):
-            out('Looking for', dev.__class__.__name__)
+        devplugins = list(sorted(device_plugins(), cmp=lambda
+                x,y:cmp(x.__class__.__name__, y.__class__.__name__)))
+        out('Available plugins:', textwrap.fill(' '.join([x.__class__.__name__ for x in
+            devplugins])))
+        out(' ')
+        out('Looking for devices...')
+        for dev in devplugins:
             connected, det = s.is_device_connected(dev, debug=True)
             if connected:
+                out('\t\tDetected possible device', dev.__class__.__name__)
                 connected_devices.append((dev, det))
 
+        out(' ')
         errors = {}
         success = False
         out('Devices possibly connected:', end=' ')
@@ -99,4 +99,30 @@ class MIBUK(USBMS):
     VENDOR_NAME = 'LINUX'
     WINDOWS_MAIN_MEM = 'WOLDERMIBUK'
 
+class JETBOOK_MINI(USBMS):
+    '''
+    ['0x4b8',
+     '0x507',
+     '0x100',
+     'ECTACO',
+     'ECTACO ATA/ATAPI Bridge (Bulk-Only)',
+     'Rev.0.20']
+    '''
+    FORMATS = ['fb2', 'txt']
+
+    gui_name = 'JetBook Mini'
+    name = 'JetBook Mini Device Interface'
+    description = _('Communicate with the JetBook Mini reader.')
+    author = 'Kovid Goyal'
+
+    VENDOR_ID = [0x4b8]
+    PRODUCT_ID = [0x507]
+    BCD = [0x100]
+    VENDOR_NAME = 'ECTACO'
+    WINDOWS_MAIN_MEM = '' # Matches PROD_
+    MAIN_MEMORY_VOLUME_LABEL = 'Jetbook Mini'
+
+    SUPPORTS_SUB_DIRS = True
+
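
Note: the quoted list in the JETBOOK_MINI docstring is a captured USB descriptor, and the class constants beneath it are what the device scanner matches against. A minimal sketch of that matching idea, using a hypothetical helper rather than calibre's real scanner code:

    def looks_like_jetbook_mini(vendor_id, product_id, bcd):
        # Mirrors the VENDOR_ID/PRODUCT_ID/BCD constants added above.
        return vendor_id in [0x4b8] and product_id in [0x507] and bcd in [0x100]

    # The captured descriptor ['0x4b8', '0x507', '0x100', ...] matches:
    assert looks_like_jetbook_mini(0x4b8, 0x507, 0x100)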
@@ -62,25 +62,34 @@ def wrap_lines(match):
     else:
         return ital+' '
 
-def line_length(format, raw, percent):
+class DocAnalysis(object):
     '''
-    raw is the raw text to find the line length to use for wrapping.
+    Provides various text analysis functions to determine how the document is structured.
+    format is the type of document analysis will be done against.
+    raw is the raw text to determine the line length to use for wrapping.
+    Blank lines are excluded from analysis
+    '''
+
+    def __init__(self, format='html', raw=''):
+        raw = raw.replace('&nbsp;', ' ')
+        if format == 'html':
+            linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
+        elif format == 'pdf':
+            linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
+        elif format == 'spanned_html':
+            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+        self.lines = linere.findall(raw)
+
+    def line_length(self, percent):
+        '''
+        Analyses the document to find the median line length.
         percentage is a decimal number, 0 - 1 which is used to determine
         how far in the list of line lengths to use. The list of line lengths is
         ordered smallest to larged and does not include duplicates. 0.5 is the
         median value.
         '''
-    raw = raw.replace('&nbsp;', ' ')
-    if format == 'html':
-        linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
-    elif format == 'pdf':
-        linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
-    elif format == 'spanned_html':
-        linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
-    lines = linere.findall(raw)
-
         lengths = []
-    for line in lines:
+        for line in self.lines:
             if len(line) > 0:
                 lengths.append(len(line))
@@ -106,6 +115,52 @@ def line_length(format, raw, percent):
 
         return lengths[index]
 
+    def line_histogram(self, percent):
+        '''
+        Creates a broad histogram of the document to determine whether it incorporates hard
+        line breaks.  Lines are sorted into 20 'buckets' based on length.
+        percent is the percentage of lines that should be in a single bucket to return true
+        The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
+        '''
+        minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
+        maxLineLength=1900 # Discard larger than this to stay in range
+        buckets=20 # Each line is divided into a bucket based on length
+
+        #print "there are "+str(len(lines))+" lines"
+        #max = 0
+        #for line in self.lines:
+        #    l = len(line)
+        #    if l > max:
+        #        max = l
+        #print "max line found is "+str(max)
+        # Build the line length histogram
+        hRaw = [ 0 for i in range(0,buckets) ]
+        for line in self.lines:
+            l = len(line)
+            if l > minLineLength and l < maxLineLength:
+                l = int(l/100)
+                #print "adding "+str(l)
+                hRaw[l]+=1
+
+        # Normalize the histogram into percents
+        totalLines = len(self.lines)
+        h = [ float(count)/totalLines for count in hRaw ]
+        #print "\nhRaw histogram lengths are: "+str(hRaw)
+        #print "            percents are: "+str(h)+"\n"
+
+        # Find the biggest bucket
+        maxValue = 0
+        for i in range(0,len(h)):
+            if h[i] > maxValue:
+                maxValue = h[i]
+
+        if maxValue < percent:
+            #print "Line lengths are too variable. Not unwrapping."
+            return False
+        else:
+            #print str(maxValue)+" of the lines were in one bucket"
+            return True
+
 class Dehyphenator(object):
     '''
     Analyzes words to determine whether hyphens should be retained/removed.  Uses the document
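
Note: the bucketing in line_histogram is easy to sanity-check in isolation. A self-contained sketch of the same idea (the names here are illustrative, not calibre's):

    def mostly_one_bucket(line_lengths, percent=0.5, bucket_size=100, buckets=20):
        # Sort lines into fixed-width buckets by length, then see whether a
        # single bucket holds at least `percent` of all lines -- the signature
        # of a document with hard line breaks at a consistent width.
        histogram = [0] * buckets
        for l in line_lengths:
            if 20 < l < 1900:
                histogram[l // bucket_size] += 1
        total = len(line_lengths)
        return total > 0 and max(histogram) / float(total) >= percent

    # OCR-style text wrapped at ~60 characters clusters in one bucket:
    assert mostly_one_bucket([58, 61, 60, 59, 62, 57, 60, 61])
    # Genuinely flowing paragraphs of varied length do not:
    assert not mostly_one_bucket([30, 250, 900, 1200, 75, 480, 640, 1500])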
@@ -117,23 +172,41 @@ class Dehyphenator(object):
 
     def __init__(self):
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
+        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
+        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
 
     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
         secondhalf = match.group('secondpart')
+        try:
+            wraptags = match.group('wraptags')
+        except:
+            wraptags = ''
         hyphenated = str(firsthalf) + "-" + str(secondhalf)
         dehyphenated = str(firsthalf) + str(secondhalf)
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
-        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
         #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
-        match = booklookup.search(self.html)
-        if match:
+        try:
+            searchresult = self.html.find(str.lower(lookupword))
+        except:
+            return hyphenated
+        if self.format == 'html_cleanup':
+            if self.html.find(lookupword) != -1 or searchresult != -1:
+                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                return dehyphenated
+            elif self.html.find(hyphenated) != -1:
+                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                return hyphenated
+            else:
+                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                return firsthalf+u'\u2014'+wraptags+secondhalf
+
+        else:
+            if self.html.find(lookupword) != -1 or searchresult != -1:
                 #print "returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
             else:
@@ -142,17 +215,19 @@ class Dehyphenator(object):
 
     def __call__(self, html, format, length=1):
         self.html = html
+        self.format = format
         if format == 'html':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(<p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+        elif format == 'html_cleanup':
+            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+
         html = intextmatch.sub(self.dehyphenate, html)
         return html
 
 
 class CSSPreProcessor(object):
 
     PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
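
Note: the calling convention for the reworked Dehyphenator, exactly as used elsewhere in this commit: the instance is called with the html, a format selector ('html', 'pdf', 'individual_words', or the new 'html_cleanup'), and the median line length, and it resolves end-of-line hyphens by looking the joined word up in the rest of the same document:

    dehyphenator = Dehyphenator()
    html = dehyphenator(html, 'html', length)          # aggressive pass during unwrapping
    html = dehyphenator(html, 'html_cleanup', length)  # conservative second pass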
@@ -286,7 +361,7 @@ class HTMLPreProcessor(object):
                   (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
 
                   # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
+                  (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
                   # Cover the case where every letter in a chapter title is separated by a space
                   (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
 
@@ -374,10 +449,8 @@ class HTMLPreProcessor(object):
                 print 'Failed to parse remove_footer regexp'
                 traceback.print_exc()
 
-        # unwrap em/en dashes, delete soft hyphens - moved here so it's executed after header/footer removal
+        # delete soft hyphens - moved here so it's executed after header/footer removal
         if is_pdftohtml:
-            # unwrap em/en dashes
-            end_rules.append((re.compile(u'(?<=[–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens
             end_rules.append((re.compile(u'[\xad](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens with formatting
@@ -391,12 +464,15 @@ class HTMLPreProcessor(object):
 
         length = -1
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
-            length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
+            docanalysis = DocAnalysis('pdf', html)
+            length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
             if length:
                 #print "The pdf line length returned is " + str(length)
+                # unwrap em/en dashes
+                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                 )
 
         for rule in self.PREPROCESS + start_rules:
@@ -454,6 +530,14 @@ class HTMLPreProcessor(object):
         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = self.smarten_punctuation(html)
 
+        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
+        if unsupported_unicode_chars:
+            from calibre.ebooks.unidecode.unidecoder import Unidecoder
+            unidecoder = Unidecoder()
+            for char in unsupported_unicode_chars:
+                asciichar = unidecoder.decode(char)
+                html = html.replace(char, asciichar)
+
         return html
 
     def smarten_punctuation(self, html):
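
Note: this hook ties the two halves of the commit together: an output profile (here SonyReaderOutput) declares characters its device cannot render, and the preprocessor transliterates just those characters to ASCII. A sketch of the same substitution with a stand-in decoder (the real one is calibre.ebooks.unidecode.unidecoder.Unidecoder):

    # hypothetical stand-in for Unidecoder.decode()
    ascii_equivalents = {u'\u201f': '"', u'\u201b': "'"}

    def strip_unsupported(html, unsupported_unicode_chars):
        for char in unsupported_unicode_chars:
            html = html.replace(char, ascii_equivalents.get(char, '?'))
        return html

    print(strip_unsupported(u'\u201bquoted\u201f', [u'\u201f', u'\u201b']))
    # -> 'quoted"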
@@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
 import re
-from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
 
 class PreProcessor(object):
@@ -77,13 +77,18 @@ class PreProcessor(object):
 
     def __call__(self, html):
         self.log("********* Preprocessing HTML *********")
+
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = re.sub(r"\s*</p>", "</p>\n", html)
+        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
         # <pre> tags), check and mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
             self.log("not enough paragraph markers, adding now")
-            # check if content is in pre tags, use txt procesor to mark up if so
+            # check if content is in pre tags, use txt processor to mark up if so
             pre = re.compile(r'<pre>', re.IGNORECASE)
             if len(pre.findall(html)) == 1:
                 self.log("Running Text Processing")
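
Note: the two re.sub calls moved to the top of __call__ normalise paragraph boundaries so that every </p> ends a line and every <p> starts one; the line-oriented analysis below (DocAnalysis, no_markup) depends on that layout. For example:

    import re
    html = "<p>one</p>   <p>two</p>"
    html = re.sub(r"\s*</p>", "</p>\n", html)
    html = re.sub(r"\s*<p>\s*", "\n<p>", html)
    # html is now "\n<p>one</p>\n<p>two</p>\n"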
@@ -113,47 +118,77 @@ class PreProcessor(object):
         # Get rid of empty <o:p> tags to simplify other processing
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Get rid of empty span, bold, & italics tags
-        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
 
-        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+        # If more than 40% of the lines are empty paragraphs and the user has enabled remove
+        # paragraph spacing then delete blank lines to clean up spacing
         linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
+        blanks_between_paragraphs = False
         if len(lines) > 1:
             self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
             if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
                     'remove_paragraph_spacing', False):
                 self.log("deleting blank lines")
                 html = blankreg.sub('', html)
-        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-        html = re.sub(r"\s*</p>", "</p>\n", html)
-        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+            elif float(len(blanklines)) / float(len(lines)) > 0.40:
+                blanks_between_paragraphs = True
+                #print "blanks between paragraphs is marked True"
+            else:
+                blanks_between_paragraphs = False
+        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
         # detect chapters/sections to match xpath or splitting logic
+        #
+        # Build the Regular Expressions in pieces
+        lookahead = "(?=<(p|div))"
+        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        chapter_header_open = r"(?P<chap>"
+        chapter_header_close = ")\s*"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
+        if blanks_between_paragraphs:
+            blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
+        else:
+            blank_lines = ""
+        opt_title_open = "("
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
+        title_header_open = "(?P<title>"
+        title_header_close = ")\s*"
+        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
+        opt_title_close = ")?"
+
+        default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
+        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
+        numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
+        uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
+
+        chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+        #print chapter_marker
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
         #
         # Start with most typical chapter headings, get more aggressive until one works
         if self.html_preprocess_sections < 10:
-            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
+            chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect2.sub(self.chapter_head, html)
 
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
 
         ###### Unwrap lines ######
         #
-        self.log("Unwrapping Lines")
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags
         # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
         # that lines can be un-wrapped across page boundaries
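
Note: building chapdetect out of named pieces makes each escalation step (typical, then numeric, then uppercase chapter headings) a one-line recombination instead of a second 700-character regex. A reduced illustration of the same composition idea, with deliberately simplified patterns rather than the ones above:

    import re

    lookahead  = r"(?=<(p|div))"
    line_open  = r"<(?P<outer>p|div)[^>]*>\s*"
    line_close = r"\s*</(?P=outer)>"
    typical    = r"(?P<chap>(Chapter|Prologue|Epilogue)\s*\d*)"
    numeric    = r"(?P<chap>\d+\.?)"

    chapdetect = re.compile(lookahead + line_open + typical + line_close, re.IGNORECASE)
    assert chapdetect.search("<p>Chapter 12</p>")
    # escalate: swap only the header piece, reuse everything else
    chapdetect2 = re.compile(lookahead + line_open + numeric + line_close, re.IGNORECASE)
    assert chapdetect2.search("<p>12.</p>")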
@@ -168,25 +203,40 @@ class PreProcessor(object):
                 format = 'html'
         else:
             format = 'html'
+        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+        # more of the lines break in the same region of the document then unwrapping is required
+        docanalysis = DocAnalysis(format, html)
+        hardbreaks = docanalysis.line_histogram(.50)
+        self.log("Hard line breaks check returned "+str(hardbreaks))
         # Calculate Length
-        length = line_length(format, html, getattr(self.extra_opts,
-            'html_unwrap_factor', 0.4))
+        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+        length = docanalysis.line_length(unwrap_factor)
         self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
-        max_length = length * 1.4
-        min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})")
-        #
-        # Unwrap em/en dashes, delete soft-hyphens
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
-        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
-        html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html)
+        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
+        if hardbreaks or unwrap_factor < 0.4:
+            self.log("Unwrapping required, unwrapping Lines")
+            # Unwrap em/en dashes
+            html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
             # Dehyphenate
+            self.log("Unwrapping/Removing hyphens")
             dehyphenator = Dehyphenator()
             html = dehyphenator(html,'html', length)
+            self.log("Done dehyphenating")
             # Unwrap lines using punctation and line length
-            unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+            unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
             html = unwrap.sub(' ', html)
+            #check any remaining hyphens, but only unwrap if there is a match
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+        else:
+            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+            self.log("Cleaning up hyphenation")
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+            self.log("Done dehyphenating")
+
+        # delete soft hyphens
+        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+
         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < 10:
@@ -9,6 +9,7 @@ import traceback, socket, re, sys
 from functools import partial
 from threading import Thread, Event
 from Queue import Queue, Empty
+from lxml import etree
 
 import mechanize
 
@@ -216,6 +217,68 @@ def download_covers(mi, result_queue, max_covers=50, timeout=5.): # {{{
 
 # }}}
 
+class DoubanCovers(CoverDownload): # {{{
+    'Download covers from Douban.com'
+
+    DOUBAN_ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
+    CALIBRE_DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
+    name = 'Douban.com covers'
+    description = _('Download covers from Douban.com')
+    author = 'Li Fanxi'
+
+    def get_cover_url(self, isbn, br, timeout=5.):
+        try:
+            url = self.DOUBAN_ISBN_URL + isbn + "?apikey=" + self.CALIBRE_DOUBAN_API_KEY
+            src = br.open(url, timeout=timeout).read()
+        except Exception, err:
+            if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
+                err = Exception(_('Douban.com API timed out. Try again later.'))
+            raise err
+        else:
+            feed = etree.fromstring(src)
+            NAMESPACES = {
+                'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
+                'atom' : 'http://www.w3.org/2005/Atom',
+                'db': 'http://www.douban.com/xmlns/'
+            }
+            XPath = partial(etree.XPath, namespaces=NAMESPACES)
+            entries = XPath('//atom:entry')(feed)
+            if len(entries) < 1:
+                return None
+            try:
+                cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
+                u = cover_url(entries[0])[0].replace('/spic/', '/lpic/');
+                # If URL contains "book-default", the book doesn't have a cover
+                if u.find('book-default') != -1:
+                    return None
+            except:
+                return None
+            return u
+
+    def has_cover(self, mi, ans, timeout=5.):
+        if not mi.isbn:
+            return False
+        br = browser()
+        try:
+            if self.get_cover_url(mi.isbn, br, timeout=timeout) != None:
+                self.debug('cover for', mi.isbn, 'found')
+                ans.set()
+        except Exception, e:
+            self.debug(e)
+
+    def get_covers(self, mi, result_queue, abort, timeout=5.):
+        if not mi.isbn:
+            return
+        br = browser()
+        try:
+            url = self.get_cover_url(mi.isbn, br, timeout=timeout)
+            cover_data = br.open_novisit(url).read()
+            result_queue.put((True, cover_data, 'jpg', self.name))
+        except Exception, e:
+            result_queue.put((False, self.exception_to_string(e),
+                traceback.format_exc(), self.name))
+# }}}
+
 def download_cover(mi, timeout=5.): # {{{
     results = Queue()
     download_covers(mi, results, max_covers=1, timeout=timeout)
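
Note: DoubanCovers resolves an ISBN to a cover in two steps: fetch the Atom entry for the ISBN from the Douban API, then pull the image link out of it and upgrade the thumbnail path (/spic/) to the large one (/lpic/). The request it issues has this shape (the ISBN below is purely illustrative; the API key is the one in the diff):

    isbn = '9787536692930'
    url = ('http://api.douban.com/book/subject/isbn/' + isbn
           + '?apikey=' + '0bd1672394eb1ebf2374356abec15c3d')
    # br.open(url, timeout=timeout).read(), then etree.fromstring(...),
    # then XPath "descendant::atom:link[@rel='image']/attribute::href"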
@@ -181,7 +181,7 @@ def metadata_from_filename(name, pat=None):
                 mi.isbn = si
         except (IndexError, ValueError):
             pass
-    if not mi.title:
+    if mi.is_null('title'):
         mi.title = name
     return mi
 
@@ -184,7 +184,7 @@ class EditMetadataAction(InterfaceAction):
         self.gui.tags_view.blockSignals(True)
         try:
             changed = MetadataBulkDialog(self.gui, rows,
-                self.gui.library_view.model().db).changed
+                self.gui.library_view.model()).changed
         finally:
             self.gui.tags_view.blockSignals(False)
         if changed:
@@ -142,12 +142,13 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
                     _('Append to field'),
                    ]
 
-    def __init__(self, window, rows, db):
+    def __init__(self, window, rows, model):
         QDialog.__init__(self, window)
         Ui_MetadataBulkDialog.__init__(self)
         self.setupUi(self)
-        self.db = db
-        self.ids = [db.id(r) for r in rows]
+        self.model = model
+        self.db = model.db
+        self.ids = [self.db.id(r) for r in rows]
         self.box_title.setText('<p>' +
                 _('Editing meta information for <b>%d books</b>') %
                 len(rows))
|
|||||||
self.tag_editor_button.clicked.connect(self.tag_editor)
|
self.tag_editor_button.clicked.connect(self.tag_editor)
|
||||||
self.autonumber_series.stateChanged[int].connect(self.auto_number_changed)
|
self.autonumber_series.stateChanged[int].connect(self.auto_number_changed)
|
||||||
|
|
||||||
if len(db.custom_field_keys(include_composites=False)) == 0:
|
if len(self.db.custom_field_keys(include_composites=False)) == 0:
|
||||||
self.central_widget.removeTab(1)
|
self.central_widget.removeTab(1)
|
||||||
else:
|
else:
|
||||||
self.create_custom_column_editors()
|
self.create_custom_column_editors()
|
||||||
@ -617,8 +618,15 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
|
|||||||
self.worker = Worker(args, self.db, self.ids,
|
self.worker = Worker(args, self.db, self.ids,
|
||||||
getattr(self, 'custom_column_widgets', []),
|
getattr(self, 'custom_column_widgets', []),
|
||||||
Dispatcher(bb.accept, parent=bb))
|
Dispatcher(bb.accept, parent=bb))
|
||||||
|
|
||||||
|
# The metadata backup thread causes database commits
|
||||||
|
# which can slow down bulk editing of large numbers of books
|
||||||
|
self.model.stop_metadata_backup()
|
||||||
|
try:
|
||||||
self.worker.start()
|
self.worker.start()
|
||||||
bb.exec_()
|
bb.exec_()
|
||||||
|
finally:
|
||||||
|
self.model.start_metadata_backup()
|
||||||
|
|
||||||
if self.worker.error is not None:
|
if self.worker.error is not None:
|
||||||
return error_dialog(self, _('Failed'),
|
return error_dialog(self, _('Failed'),
|
||||||
|
@@ -57,6 +57,10 @@ class SchedulerDialog(QDialog, Ui_Dialog):
 
         self.old_news.setValue(gconf['oldest_news'])
 
+    def keyPressEvent(self, ev):
+        if ev.key() not in (Qt.Key_Enter, Qt.Key_Return):
+            return QDialog.keyPressEvent(self, ev)
+
     def break_cycles(self):
         self.disconnect(self.recipe_model, SIGNAL('searched(PyQt_PyObject)'),
                 self.search_done)
@@ -159,17 +159,24 @@ class BooksModel(QAbstractTableModel): # {{{
         # do something on the GUI thread. Deadlock.
         self.cover_cache = CoverCache(db, FunctionDispatcher(self.db.cover))
         self.cover_cache.start()
-        if self.metadata_backup is not None:
-            self.metadata_backup.stop()
-            # Would like to to a join here, but the thread might be waiting to
-            # do something on the GUI thread. Deadlock.
-        self.metadata_backup = MetadataBackup(db)
-        self.metadata_backup.start()
+        self.stop_metadata_backup()
+        self.start_metadata_backup()
         def refresh_cover(event, ids):
             if event == 'cover' and self.cover_cache is not None:
                 self.cover_cache.refresh(ids)
         db.add_listener(refresh_cover)
 
+    def start_metadata_backup(self):
+        self.metadata_backup = MetadataBackup(self.db)
+        self.metadata_backup.start()
+
+    def stop_metadata_backup(self):
+        if getattr(self, 'metadata_backup', None) is not None:
+            self.metadata_backup.stop()
+            # Would like to to a join here, but the thread might be waiting to
+            # do something on the GUI thread. Deadlock.
+
     def refresh_ids(self, ids, current_row=-1):
         rows = self.db.refresh_ids(ids)
         if rows:
@@ -106,14 +106,13 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
         d.exec_()
 
     def compact(self, *args):
-        from calibre.library.caches import MetadataBackup
         m = self.gui.library_view.model()
-        if m.metadata_backup is not None:
-            m.metadata_backup.stop()
+        m.stop_metadata_backup()
+        try:
             d = CheckIntegrity(m.db, self)
             d.exec_()
-        m.metadata_backup = MetadataBackup(m.db)
-        m.metadata_backup.start()
+        finally:
+            m.start_metadata_backup()
 
     def open_config_dir(self, *args):
         from calibre.utils.config import config_dir
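
Note: both call sites introduced in this commit (the bulk metadata edit above and this integrity check) follow the same pattern around a long-running database operation, with the restart in a finally block so the backup thread comes back even on error. The shape of the pattern, with a placeholder for the actual work:

    model = self.gui.library_view.model()
    model.stop_metadata_backup()
    try:
        run_long_database_operation()   # placeholder for the worker / dialog
    finally:
        model.start_metadata_backup()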
@@ -217,9 +217,12 @@ def fetch_scheduled_recipe(arg):
     if 'output_profile' in ps:
         recs.append(('output_profile', ps['output_profile'],
             OptionRecommendation.HIGH))
-        if ps['output_profile'] == 'kindle':
-            recs.append(('no_inline_toc', True,
-                OptionRecommendation.HIGH))
+        # Disabled since apparently some people use
+        # K4PC and, surprise, surprise, it doesn't support
+        # indexed MOBIs.
+        #if ps['output_profile'] == 'kindle':
+        #    recs.append(('no_inline_toc', True,
+        #        OptionRecommendation.HIGH))
 
     lf = load_defaults('look_and_feel')
     if lf.get('base_font_size', 0.0) != 0.0:
@@ -73,6 +73,14 @@ class JetBook(Device):
     manufacturer = 'Ectaco'
     id = 'jetbook'
 
+class JetBookMini(Device):
+
+    output_profile = 'jetbook5'
+    output_format = 'FB2'
+    name = 'JetBook Mini'
+    manufacturer = 'Ectaco'
+    id = 'jetbookmini'
+
 class KindleDX(Kindle):
 
     output_profile = 'kindle_dx'
@@ -584,12 +592,42 @@ class LibraryPage(QWizardPage, LibraryUI):
             qt_app.load_translations()
             self.emit(SIGNAL('retranslate()'))
             self.init_languages()
+            try:
+                if prefs['language'].lower().startswith('zh'):
+                    from calibre.customize.ui import enable_plugin
+                    for name in ('Douban Books', 'Douban.com covers'):
+                        enable_plugin(name)
+            except:
+                pass
+
+    def is_library_dir_suitable(self, x):
+        return LibraryDatabase2.exists_at(x) or not os.listdir(x)
+
+    def validatePage(self):
+        newloc = unicode(self.location.text())
+        if not self.is_library_dir_suitable(newloc):
+            self.show_library_dir_error(newloc)
+            return False
+        return True
 
     def change(self):
-        dir = choose_dir(self, 'database location dialog',
+        x = choose_dir(self, 'database location dialog',
                 _('Select location for books'))
-        if dir:
-            self.location.setText(dir)
+        if x:
+            if self.is_library_dir_suitable(x):
+                self.location.setText(x)
+            else:
+                self.show_library_dir_error(x)
+
+    def show_library_dir_error(self, x):
+        if not isinstance(x, unicode):
+            try:
+                x = x.decode(filesystem_encoding)
+            except:
+                x = unicode(repr(x))
+        error_dialog(self, _('Bad location'),
+                _('You must choose an empty folder for '
+                    'the calibre library. %s is not empty.')%x, show=True)
 
     def initializePage(self):
         lp = prefs['library_path']
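
Note: the guard introduced here treats a directory as a valid library target only if it already is a calibre library or is empty. The rule itself is one line and easy to check in isolation (the temporary directory and stand-in callback below are purely for illustration):

    import os, tempfile

    def dir_is_suitable(x, exists_at=lambda p: False):
        # exists_at stands in for LibraryDatabase2.exists_at (metadata.db check)
        return exists_at(x) or not os.listdir(x)

    d = tempfile.mkdtemp()
    assert dir_is_suitable(d)           # freshly created, hence empty
    open(os.path.join(d, 'stray.txt'), 'w').close()
    assert not dir_is_suitable(d)       # non-empty and not a library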