mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09)

commit bf2850019e
0.8.45+
@@ -13,7 +13,7 @@ class HighCountryNews(BasicNewsRecipe):
     __author__ = 'Armin Geller' # 2012-01-31
     publisher = 'High Country News'
     timefmt = ' [%a, %d %b %Y]'
-    language = 'en-Us'
+    language = 'en'
     encoding = 'UTF-8'
     publication_type = 'newspaper'
     oldest_article = 7

@@ -1,45 +1,73 @@
-# Talking Points is not grabbing everything.
-# The look is right, but only the last one added?
-import re
+import string, re
 import time
+import traceback
+# above for debugging via stack
 from calibre.web.feeds.recipes import BasicNewsRecipe
 # Allows the Python soup converter, which makes parsing easier.
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
-# strip ads and graphics
-# Current Column lacks a title.
-# Talking Points Memo - shorten title - Remove year and Bill's name
+import os, time, traceback, re, urlparse, sys, cStringIO
+from collections import defaultdict
+from functools import partial
+from contextlib import nested, closing
+
+from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
+from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
+
+# To Do: strip ads and graphics, Current Column lacks a title.
 # The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
 # Newsletters: Talking Points Memos covered by cat12
+# ./ebook-convert --username xxx --password xxx
+
+# this is derived from BasicNewsRecipe, so it can only overload those.
+# Soome of what we need is otherwise in article, so we have more copy to do than otherwise.
 class OReillyPremium(BasicNewsRecipe):
     title = u'OReilly Premium'
     __author__ = 'TMcN'
-    language = 'en'
     description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
     cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
+    custom_title = 'Bill O\'Reilly Premium - '+ time.strftime('%d %b %Y')
+    title = 'Bill O\'Reilly Premium'
     auto_cleanup = True
+    conversion_options = {'linearize_tables': True}
     encoding = 'utf8'
-    needs_subscription = True
+    language = 'en'
     no_stylesheets = True
-    oldest_article = 20
+    needs_subscription = True
+    oldest_article = 31
     remove_javascript = True
     remove_tags = [dict(name='img', attrs={})]
     # Don't go down
     recursions = 0
-    max_articles_per_feed = 2000
+    max_articles_per_feed = 20

     debugMessages = True

     # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
     catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
-        ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
-        ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
-        ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
-        ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
+        # ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
+        # ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
+        # ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
+        # ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
         ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
     ]

+    feeds = [
+        (u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
+        (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
+        (u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
+        (u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
+        (u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
+    ]
+    # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is word for the day.
+
+    # Note: Talking Points is broken in the above model; the site changed to more Ajax-y.
+    # Now using RSS
+
     def get_browser(self):
+        print("In get_browser")
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
             br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
@@ -66,6 +94,7 @@ class OReillyPremium(BasicNewsRecipe):
     def stripBadChars(self, inString) :
         return inString.replace("\'", "")

+
     def parseGeneric(self, baseURL):
         # Does a generic parsing of the articles. There are six categories (0-5)
         # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
@@ -73,6 +102,7 @@ class OReillyPremium(BasicNewsRecipe):
         fullReturn = []
         for i in range(len(self.catList)) :
             articleList = []
+            print("In "+self.catList[i][0]+", index: "+ str(i))
             soup = self.index_to_soup(self.catList[i][1])
             # Set defaults
             description = 'None'
@@ -81,14 +111,12 @@ class OReillyPremium(BasicNewsRecipe):
             # 3-5 create one.
             # So no for-div for 3-5

-            if i < 3 :
+            if i == 0 :
+                print("Starting TV Archives")
                 for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
+                    print("Next DIV:")
                     print(div)
-                    if i == 1:
-                        a = div.find('a', href=True)
-                    else :
-                        a = div
-                    print(a)
+                    a = div
                     summary = div.find(True, attrs={'class':'summary'})
                     if summary:
                         description = self.tag_to_string(summary, use_alt=False)
@@ -96,82 +124,63 @@ class OReillyPremium(BasicNewsRecipe):
                         continue
                     # url = baseURL+re.sub(r'\?.*', '', a['href'])
                     url = baseURL+a['href']
-                    if i < 2 :
-                        url = self.extractPrintURL(baseURL, url, "Print this entry")
-                        title = self.tag_to_string(a, use_alt=True).strip()
-                    elif i == 2 :
-                        # Daily Briefs
-                        url = self.extractPrintURL(baseURL, url, "Print this entry")
-                        title = div.contents[0]
-                    if self.debugMessages :
-                        print(title+" @ "+url)
+                    url = self.extractPrintURL(baseURL, url, "Print this entry")
+                    title = self.tag_to_string(a, use_alt=True).strip()
                     articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))

-            elif i == 3 : # Stratfor
-                a = soup.find('a', self.catList[i][3])
-                if a is None :
-                    continue
-                url = baseURL+a['href']
-                title = self.tag_to_string(a, use_alt=True).strip()
-                # Get Stratfor contents so we can get the real title.
-                stratSoup = self.index_to_soup(url)
-                title = stratSoup.html.head.title.string
-                stratIndex = title.find('Stratfor.com:', 0)
-                if (stratIndex > -1) :
-                    title = title[stratIndex+14:-1]
-                # Look for first blogBody <td class="blogBody"
-                # Changed 12 Jan 2012 - new page format
-                #stratBlogTable = stratSoup.find('td', {'class':['blogBody']}).findParent('table')
-                #stratBody = stratSoup.find('td', {'class':['blogBody']})
-            elif i == 4 : # Talking Points
-                topDate = soup.find("td", "blogBody")
-                if not topDate :
-                    print("Failed to find date in Talking Points")
-                # This page has the contents in double-wrapped tables!
-                myTable = topDate.findParents('table')[0]
-                if myTable is not None:
-                    upOneTable = myTable.findParents('table')[0]
-                    if upOneTable is not None:
-                        upTwo = upOneTable.findParents('table')[0]
-                        if upTwo is None:
-                            continue
-                        # Now navigate rows of upTwo
-                        if self.debugMessages :
-                            print("Entering rows")
-                        for rows in upTwo.findChildren("tr", recursive=False):
-                            # Inside top level table, each row is an article
-                            rowTable = rows.find("table")
-                            articleTable = rowTable.find("table")
-                            # This looks wrong.
-                            articleTable = rows.find("tr")
-                            # The middle table is just for formatting the article buffer... but this means we can skip the inner table.
-                            blogDate = articleTable.find("a","blogDate").contents[0]
-                            # Skip to second blogBody for this.
-                            blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
-                            blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
-                            url = baseURL+re.sub(r'\?.*', '', blogURL)
-                            title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
-                            if self.debugMessages :
-                                print("Talking Points Memo title "+title+" at url: "+url)
-                            pubdate = time.strftime('%a, %d %b')
-                            articleList.append(dict(title=title, url=url, date=pubdate, description='None', content=''))
             else : # Current Column
                 titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
                 if titleSpan is None :
+                    print("No Current Column Title Span")
+                    print(soup)
                     continue
                 title = titleSpan.contents[0]
                 url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
-            if i == 3 or i == 5 :
+            if i == 1 :
                 if self.debugMessages :
                     print(self.catList[i][0]+" Title:"+title+" at url: "+url)
                 summary = div.find(True, attrs={'class':'summary'})
-                if summary:
+                print("At Summary")
+                print(summary)
+                if summary is not None:
                     description = self.tag_to_string(summary, use_alt=False)
+                print("At append")
                 articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
             self.catList[i][3] = articleList
             fullReturn.append((self.catList[i][0], articleList))
+        print("Returning")
+        # print fullReturn
         return fullReturn

+    # build_index() starts with:
+    # try:
+    #     feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
+    #                              max_articles_per_feed=self.max_articles_per_feed,
+    #                              log=self.log)
+    #     self.report_progress(0, _('Got feeds from index page'))
+    # except NotImplementedError:
+    #     feeds = self.parse_feeds()
+
+    # which in turn is from __init__.py
+    #def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
+    #        log=default_log):
+    #'''
+    #@param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
+    #@return: A list of L{Feed} objects.
+    #@rtype: list
+    #'''
+    #feeds = []
+    #for title, articles in index:
+    #    pfeed = Feed(log=log)
+    #    pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
+    #            max_articles_per_feed=max_articles_per_feed)
+    #    feeds.append(pfeed)
+    #return feeds
+
+    # use_embedded_content defaults to None, at which point if the content is > 2K, it is used as the article.
+
 # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
 # returns a list of tuple ('feed title', list of articles)
 # {
@@ -182,12 +191,19 @@ class OReillyPremium(BasicNewsRecipe):
     # 'content' : The full article (can be an empty string). This is used by FullContentProfile
     # }
     # this is used instead of BasicNewsRecipe.parse_feeds().
+    # it is called by download
     def parse_index(self):
         # Parse the page into Python Soup
+        print("Entering recipe print_index from:")
+        traceback.print_stack()
+        print("web")
         baseURL = "https://www.billoreilly.com"
-        return self.parseGeneric(baseURL)
+        masterList = self.parseGeneric(baseURL)
+        #print(masterList)
+        return masterList

     def preprocess_html(self, soup):
+        print("In preprocess_html")
         refresh = soup.find('meta', {'http-equiv':'refresh'})
         if refresh is None:
             return soup
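Concretely, the contract described in the comments above means parse_index() must return a list of ('feed title', article-dict list) tuples. An illustrative (not actual) return value, with made-up titles and a truncated URL:

    # Illustrative shape of parse_index()'s return value, per the comments above.
    [('TV Archives', [
        {'title': 'Show recap',                      # article title
         'url': 'https://www.billoreilly.com/...',   # print-friendly URL
         'date': 'Mon, 30 Jan',                      # pubdate string
         'description': 'None',
         'content': ''},                             # empty; fetched during download
    ])]
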
@@ -195,3 +211,128 @@ class OReillyPremium(BasicNewsRecipe):
         raw = self.browser.open('https://www.billoreilly.com'+content).read()
         return BeautifulSoup(raw.decode('cp1252', 'replace'))

+    def build_index(self):
+        print("In OReilly build_index()\n\n")
+        feedsRSS = []
+        self.report_progress(0, _('Fetching feeds...'))
+        #try:
+        feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
+                                 max_articles_per_feed=self.max_articles_per_feed,
+                                 log=self.log)
+        self.report_progress(0, _('Got feeds from index page'))
+        #except NotImplementedError:
+        #    feeds = self.parse_feeds()
+        # Now add regular feeds.
+        feedsRSS = self.parse_feeds()
+        print ("feedsRSS is type "+feedsRSS.__class__.__name__)
+
+        for articles in feedsRSS:
+            print("articles is type "+articles.__class__.__name__)
+            print("Title:" + articles.title)
+            feeds.append(articles)
+        if not feeds:
+            raise ValueError('No articles found, aborting')
+
+        #feeds = FeedCollection(feeds)
+
+        self.report_progress(0, _('Trying to download cover...'))
+        self.download_cover()
+        self.report_progress(0, _('Generating masthead...'))
+        self.masthead_path = None
+
+        try:
+            murl = self.get_masthead_url()
+        except:
+            self.log.exception('Failed to get masthead url')
+            murl = None
+
+        if murl is not None:
+            # Try downloading the user-supplied masthead_url
+            # Failure sets self.masthead_path to None
+            self.download_masthead(murl)
+        if self.masthead_path is None:
+            self.log.info("Synthesizing mastheadImage")
+            self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
+            try:
+                self.default_masthead_image(self.masthead_path)
+            except:
+                self.log.exception('Failed to generate default masthead image')
+                self.masthead_path = None
+
+        if self.test:
+            feeds = feeds[:2]
+        self.has_single_feed = len(feeds) == 1
+
+        index = os.path.join(self.output_dir, 'index.html')
+
+        html = self.feeds2index(feeds)
+        with open(index, 'wb') as fi:
+            fi.write(html)
+
+        self.jobs = []
+
+        if self.reverse_article_order:
+            for feed in feeds:
+                if hasattr(feed, 'reverse'):
+                    feed.reverse()
+
+        self.feed_objects = feeds
+        for f, feed in enumerate(feeds):
+            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+            if not os.path.isdir(feed_dir):
+                os.makedirs(feed_dir)
+
+            for a, article in enumerate(feed):
+                if a >= self.max_articles_per_feed:
+                    break
+                art_dir = os.path.join(feed_dir, 'article_%d'%a)
+                if not os.path.isdir(art_dir):
+                    os.makedirs(art_dir)
+                try:
+                    url = self.print_version(article.url)
+                except NotImplementedError:
+                    url = article.url
+                except:
+                    self.log.exception('Failed to find print version for: '+article.url)
+                    url = None
+                if not url:
+                    continue
+                func, arg = (self.fetch_embedded_article, article) \
+                    if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \
+                    else \
+                    ((self.fetch_obfuscated_article if self.articles_are_obfuscated \
+                        else self.fetch_article), url)
+                req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
+                        {}, (f, a), self.article_downloaded,
+                        self.error_in_article_download)
+                req.feed = feed
+                req.article = article
+                req.feed_dir = feed_dir
+                self.jobs.append(req)
+
+        self.jobs_done = 0
+        tp = ThreadPool(self.simultaneous_downloads)
+        for req in self.jobs:
+            tp.putRequest(req, block=True, timeout=0)
+
+        self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
+        while True:
+            try:
+                tp.poll()
+                time.sleep(0.1)
+            except NoResultsPending:
+                break
+        for f, feed in enumerate(feeds):
+            print("Writing feeds for "+feed.title)
+            html = self.feed2index(f,feeds)
+            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+            with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
+                fi.write(html)
+        self.create_opf(feeds)
+        self.report_progress(1, _('Feeds downloaded to %s')%index)
+
+        return index

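The essence of the build_index() override above is the merge it performs before the download loop: feeds built from the scraped parse_index() categories, plus the regular RSS feeds declared in self.feeds. A minimal sketch of that merge step, using the calibre-0.8-era helpers the diff itself names (feeds_from_index, parse_feeds):

    # Sketch only: the merge at the top of the new build_index().
    feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
                             max_articles_per_feed=self.max_articles_per_feed,
                             log=self.log)
    for feed in self.parse_feeds():   # regular RSS feeds from self.feeds
        feeds.append(feed)
    if not feeds:
        raise ValueError('No articles found, aborting')

The remainder of the method then follows the stock BasicNewsRecipe.build_index() it was copied from: cover, masthead, per-feed directories, and a ThreadPool of WorkRequests.
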
@@ -1,7 +1,9 @@
 # Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
+import string, re
 import time
+from urlparse import urlparse
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import NavigableString
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString

 class RealClear(BasicNewsRecipe):
     title = u'Real Clear'
@@ -20,12 +22,13 @@ class RealClear(BasicNewsRecipe):
     # Don't go down
     recursions = 0
     max_articles_per_feed = 400
-    debugMessages = False
+    debugMessages = True

     # Numeric parameter is type, controls whether we look for
     feedsets = [
         ["Politics", "http://www.realclearpolitics.com/index.xml", 0],
-        ["Science", "http://www.realclearscience.com/index.xml", 0],
+        ["Policy", "http://www.realclearpolicy.com/index.xml", 0],
+        ["Science", "http://www.realclearscience.com/index.xml", 0],
         ["Tech", "http://www.realcleartechnology.com/index.xml", 0],
         # The feedburner is essentially the same as the top feed, politics.
         # ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
@@ -37,7 +40,9 @@ class RealClear(BasicNewsRecipe):
     ]
     # Hints to extractPrintURL.
     # First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
-    printhints = [
+    phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
+
+    printhints = [ ["realclear", "", '' , 'printpage'],
         ["billoreilly.com", "Print this entry", 'a', ''],
         ["billoreilly.com", "Print This Article", 'a', ''],
         ["politico.com", "Print", 'a', 'share-print'],
@@ -48,11 +53,24 @@ class RealClear(BasicNewsRecipe):
         # usatoday - just prints with all current crap anyhow

     ]
+    # RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
+    # The print link isn't obvious, and only the end is needed (the -full append.) SO maybe try that first?s
+    # http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
+    # Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
+    # Use the FULL PRINTPAGE URL; it formats it better too!
+    #
+    # NYT - try single page...
+    # Need special code - is it one page or several? Which URL?
+    # from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
+    # to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
+    # which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"
+
     # Returns the best-guess print url.
     # The second parameter (pageURL) is returned if nothing is found.
     def extractPrintURL(self, pageURL):
         tagURL = pageURL
+        baseParse = urlparse(pageURL)
+        baseURL = baseParse[0]+"://"+baseParse[1]
         hintsCount =len(self.printhints)
         for x in range(0,hintsCount):
             if pageURL.find(self.printhints[x][0])== -1 :
@@ -62,23 +80,37 @@ class RealClear(BasicNewsRecipe):
             soup = self.index_to_soup(pageURL)
             if soup is None:
                 return pageURL
-            if len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
+            if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
+                # e.g. RealClear
                 if self.debugMessages == True :
-                    print("search1")
+                    print("Search by href: "+self.printhints[x][self.phHrefSearch])
+                printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
+            elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
+                if self.debugMessages == True :
+                    print("Search 1: "+self.printhints[x][2]+" Attributes: ")
+                    print(self.printhints[x][3])
                 printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
             elif len(self.printhints[x][3])>0 :
                 if self.debugMessages == True :
                     print("search2")
                 printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
             else :
+                if self.debugMessages == True:
+                    print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
                 printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
             if printFind is None:
                 if self.debugMessages == True :
                     print("Not Found")
+                    # print(soup)
+                    print("end soup\n\n");
                 continue

             print(printFind)
             if isinstance(printFind, NavigableString)==False:
                 if printFind['href'] is not None:
+                    print("Check "+printFind['href']+" for base of "+baseURL)
+                    if printFind['href'].find("http")!=0 :
+                        return baseURL+printFind['href']
                     return printFind['href']
             tag = printFind.parent
             print(tag)
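The named indices introduced above (phUrlSnip, phLinkText, phMainSearch, phHrefSearch) document the four columns of each printhints row. A simplified sketch of the strategy selection inside extractPrintURL(), for illustration only, not the recipe's exact code:

    # Simplified dispatch over one printhints row (illustrative).
    import re

    def find_print_link(soup, hint):
        phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
        if hint[phHrefSearch] and not hint[phLinkText]:
            # e.g. the new "realclear" row: match 'printpage' in the href itself
            return soup.find(href=re.compile(hint[phHrefSearch]))
        if hint[phMainSearch]:
            # e.g. the billoreilly rows: a tag whose text is the link text
            return soup.find(hint[phMainSearch], text=hint[phLinkText])
        return None
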
@@ -158,6 +190,7 @@ class RealClear(BasicNewsRecipe):
     def parse_index(self):
         # Parse the page into Python Soup

+        articleList = []
         ans = []
         feedsCount = len(self.feedsets)
         for x in range(0,feedsCount): # should be ,4
@@ -168,3 +201,4 @@ class RealClear(BasicNewsRecipe):
         print(ans)
         return ans

+
@@ -15,6 +15,8 @@ class Soldiers(BasicNewsRecipe):
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
+    auto_cleanup = True
+    auto_cleanup_keep = '//div[@id="mediaWrapper"]'
     simultaneous_downloads = 1
     delay = 4
     max_connections = 1
@@ -31,14 +33,14 @@ class Soldiers(BasicNewsRecipe):
         , 'language' : language
     }

-    keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})]
+    #keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})]

-    remove_tags = [
-        dict(name='div', attrs={'id':['addThis','comment','articleFooter']})
-        ,dict(name=['object','link'])
-    ]
+    #remove_tags = [
+        #dict(name='div', attrs={'id':['addThis','comment','articleFooter']})
+        #,dict(name=['object','link'])
+    #]

-    feeds = [(u'Frontpage', u'http://www.army.mil/rss/feeds/soldiersfrontpage.xml' )]
+    feeds = [(u'Frontpage', u'http://www.army.mil/rss/2/' )]


     def get_cover_url(self):

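The Soldiers change above trades hand-maintained tag lists for calibre's heuristic extractor: auto_cleanup = True enables readability-style content detection, and auto_cleanup_keep supplies an XPath for elements the cleaner must never discard, here the story's media block. Annotated, with the same values as the hunk:

    auto_cleanup = True                              # heuristic article extraction
    auto_cleanup_keep = '//div[@id="mediaWrapper"]'  # XPath of elements to preserve
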
@@ -14,7 +14,7 @@ from setup.build_environment import msvc, MT, RC
 from setup.installer.windows.wix import WixMixIn

 OPENSSL_DIR = r'Q:\openssl'
-QT_DIR = 'Q:\\Qt\\4.8.0'
+QT_DIR = 'Q:\\Qt\\4.8.1'
 QT_DLLS = ['Core', 'Gui', 'Network', 'Svg', 'WebKit', 'Xml', 'XmlPatterns']
 LIBUNRAR = 'C:\\Program Files\\UnrarDLL\\unrar.dll'
 SW = r'C:\cygwin\home\kovid\sw'
@@ -107,6 +107,7 @@ class ANDROID(USBMS):
             0xc004 : [0x0226],
             0x8801 : [0x0226, 0x0227],
             0xe115 : [0x0216], # PocketBook A10
+            0xe107 : [0x326], # PocketBook 622
             },

         # Acer

src/calibre/ebooks/metadata/sources/worker.py (new file, 95 lines)

@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os
+from threading import Event
+from io import BytesIO
+
+from calibre.utils.date import as_utc
+from calibre.ebooks.metadata.sources.identify import identify, msprefs
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.customize.ui import metadata_plugins
+from calibre.ebooks.metadata.sources.covers import download_cover
+from calibre.utils.logging import GUILog
+from calibre.ebooks.metadata.opf2 import metadata_to_opf, OPF
+
+def merge_result(oldmi, newmi, ensure_fields=None):
+    dummy = Metadata(_('Unknown'))
+    for f in msprefs['ignore_fields']:
+        if ':' in f or (ensure_fields and f in ensure_fields):
+            continue
+        setattr(newmi, f, getattr(dummy, f))
+    fields = set()
+    for plugin in metadata_plugins(['identify']):
+        fields |= plugin.touched_fields
+
+    def is_equal(x, y):
+        if hasattr(x, 'tzinfo'):
+            x = as_utc(x)
+        if hasattr(y, 'tzinfo'):
+            y = as_utc(y)
+        return x == y
+
+    for f in fields:
+        # Optimize so that set_metadata does not have to do extra work later
+        if not f.startswith('identifier:'):
+            if (not newmi.is_null(f) and is_equal(getattr(newmi, f),
+                    getattr(oldmi, f))):
+                setattr(newmi, f, getattr(dummy, f))
+
+    return newmi
+
+def main(do_identify, covers, metadata, ensure_fields):
+    failed_ids = set()
+    failed_covers = set()
+    all_failed = True
+    log = GUILog()
+
+    for book_id, mi in metadata.iteritems():
+        mi = OPF(BytesIO(mi), basedir=os.getcwdu(),
+                populate_spine=False).to_book_metadata()
+        title, authors, identifiers = mi.title, mi.authors, mi.identifiers
+        cdata = None
+        log.clear()
+
+        if do_identify:
+            results = []
+            try:
+                results = identify(log, Event(), title=title, authors=authors,
+                        identifiers=identifiers)
+            except:
+                pass
+            if results:
+                all_failed = False
+                mi = merge_result(mi, results[0], ensure_fields=ensure_fields)
+                identifiers = mi.identifiers
+                if not mi.is_null('rating'):
+                    # set_metadata expects a rating out of 10
+                    mi.rating *= 2
+                with open('%d.mi'%book_id, 'wb') as f:
+                    f.write(metadata_to_opf(mi, default_lang='und'))
+            else:
+                log.error('Failed to download metadata for', title)
+                failed_ids.add(book_id)
+
+        if covers:
+            cdata = download_cover(log, title=title, authors=authors,
+                    identifiers=identifiers)
+            if cdata is None:
+                failed_covers.add(book_id)
+            else:
+                with open('%d.cover'%book_id, 'wb') as f:
+                    f.write(cdata[-1])
+                all_failed = False
+
+        with open('%d.log'%book_id, 'wb') as f:
+            f.write(log.plain_text.encode('utf-8'))
+
+    return failed_ids, failed_covers, all_failed

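worker.py is built to run out-of-process: main() takes a {book_id: OPF-bytes} mapping, optionally identifies metadata and downloads covers, and reports back through files in its working directory (<id>.mi, <id>.cover, <id>.log) plus the returned failure sets. A hypothetical caller, to illustrate that protocol; opf_bytes is assumed to come from metadata_to_opf(mi) in the GUI process:

    # Hypothetical driver for main(), illustrating the file-based protocol.
    from calibre.ebooks.metadata.sources.worker import main

    failed_ids, failed_covers, all_failed = main(
            do_identify=True, covers=True,
            metadata={42: opf_bytes}, ensure_fields=None)
    # On success the cwd now holds 42.mi (merged OPF), 42.cover and 42.log.
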
@@ -10,13 +10,19 @@ __docformat__ = 'restructuredtext en'
 import struct, re, os, imghdr
 from collections import namedtuple
 from itertools import repeat
+from urlparse import urldefrag
+
+from lxml import etree

 from calibre.ebooks.mobi.reader.headers import NULL_INDEX
 from calibre.ebooks.mobi.reader.index import read_index
 from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
 from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
 from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
+from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.mobi.utils import read_font_record
+from calibre.ebooks.oeb.parse_utils import parse_html
+from calibre.ebooks.oeb.base import XPath, XHTML, xml2text

 Part = namedtuple('Part',
     'num type filename start end aid')
@@ -383,6 +389,19 @@ class Mobi8Reader(object):
                 len(resource_map)):
             mi.cover = resource_map[self.cover_offset]

+        if len(list(toc)) < 2:
+            self.log.warn('KF8 has no metadata Table of Contents')
+
+            for ref in guide:
+                if ref.type == 'toc':
+                    href = ref.href()
+                    href, frag = urldefrag(href)
+                    if os.path.exists(href.replace('/', os.sep)):
+                        try:
+                            toc = self.read_inline_toc(href, frag)
+                        except:
+                            self.log.exception('Failed to read inline ToC')
+
         opf = OPFCreator(os.getcwdu(), mi)
         opf.guide = guide

@@ -397,4 +416,70 @@ class Mobi8Reader(object):
         opf.render(of, ncx, 'toc.ncx')
         return 'metadata.opf'

+    def read_inline_toc(self, href, frag):
+        ans = TOC()
+        base_href = '/'.join(href.split('/')[:-1])
+        with open(href.replace('/', os.sep), 'rb') as f:
+            raw = f.read().decode(self.header.codec)
+        root = parse_html(raw, log=self.log)
+        body = XPath('//h:body')(root)
+        reached = False
+        if body:
+            start = body[0]
+        else:
+            start = None
+            reached = True
+        if frag:
+            elems = XPath('//*[@id="%s"]'%frag)
+            if elems:
+                start = elems[0]
+
+        def node_depth(elem):
+            ans = 0
+            parent = elem.getparent()
+            while parent is not None:
+                parent = parent.getparent()
+                ans += 1
+            return ans
+
+        # Layer the ToC based on nesting order in the source HTML
+        current_depth = None
+        parent = ans
+        seen = set()
+        links = []
+        for elem in root.iterdescendants(etree.Element):
+            if reached and elem.tag == XHTML('a') and elem.get('href',
+                    False):
+                href = elem.get('href')
+                href, frag = urldefrag(href)
+                href = base_href + '/' + href
+                text = xml2text(elem).strip()
+                if (text, href, frag) in seen:
+                    continue
+                seen.add((text, href, frag))
+                links.append((text, href, frag, node_depth(elem)))
+            elif elem is start:
+                reached = True
+
+        depths = sorted(set(x[-1] for x in links))
+        depth_map = {x:i for i, x in enumerate(depths)}
+        for text, href, frag, depth in links:
+            depth = depth_map[depth]
+            if current_depth is None:
+                current_depth = 0
+                parent.add_item(href, frag, text)
+            elif current_depth == depth:
+                parent.add_item(href, frag, text)
+            elif current_depth < depth:
+                parent = parent[-1] if len(parent) > 0 else parent
+                parent.add_item(href, frag, text)
+                current_depth += 1
+            else:
+                delta = current_depth - depth
+                while delta > 0 and parent.parent is not None:
+                    parent = parent.parent
+                    delta -= 1
+                parent.add_item(href, frag, text)
+                current_depth = depth
+        return ans

|
|||||||
custom_size = None
|
custom_size = None
|
||||||
return custom_size
|
return custom_size
|
||||||
|
|
||||||
def get_pdf_printer(opts, for_comic=False):
|
def get_pdf_printer(opts, for_comic=False, output_file_name=None):
|
||||||
from calibre.gui2 import is_ok_to_use_qt
|
from calibre.gui2 import is_ok_to_use_qt
|
||||||
if not is_ok_to_use_qt():
|
if not is_ok_to_use_qt():
|
||||||
raise Exception('Not OK to use Qt')
|
raise Exception('Not OK to use Qt')
|
||||||
|
|
||||||
printer = QPrinter(QPrinter.HighResolution)
|
printer = QPrinter(QPrinter.HighResolution)
|
||||||
custom_size = get_custom_size(opts)
|
custom_size = get_custom_size(opts)
|
||||||
|
if isosx and not for_comic:
|
||||||
if opts.output_profile.short_name == 'default' or \
|
# On OSX, the native engine can only produce a single page size
|
||||||
opts.output_profile.width > 9999:
|
# (usually A4). The Qt engine on the other hand produces image based
|
||||||
if custom_size is None:
|
# PDFs. If we set a custom page size using QSizeF the native engine
|
||||||
printer.setPaperSize(paper_size(opts.paper_size))
|
# produces unreadable output, so we just ignore the custom size
|
||||||
else:
|
# settings.
|
||||||
printer.setPaperSize(QSizeF(custom_size[0], custom_size[1]), unit(opts.unit))
|
printer.setPaperSize(paper_size(opts.paper_size))
|
||||||
else:
|
else:
|
||||||
w = opts.output_profile.comic_screen_size[0] if for_comic else \
|
if opts.output_profile.short_name == 'default' or \
|
||||||
opts.output_profile.width
|
opts.output_profile.width > 9999:
|
||||||
h = opts.output_profile.comic_screen_size[1] if for_comic else \
|
if custom_size is None:
|
||||||
opts.output_profile.height
|
printer.setPaperSize(paper_size(opts.paper_size))
|
||||||
dpi = opts.output_profile.dpi
|
else:
|
||||||
printer.setPaperSize(QSizeF(float(w) / dpi, float(h) / dpi), QPrinter.Inch)
|
printer.setPaperSize(QSizeF(custom_size[0], custom_size[1]), unit(opts.unit))
|
||||||
|
else:
|
||||||
|
w = opts.output_profile.comic_screen_size[0] if for_comic else \
|
||||||
|
opts.output_profile.width
|
||||||
|
h = opts.output_profile.comic_screen_size[1] if for_comic else \
|
||||||
|
opts.output_profile.height
|
||||||
|
dpi = opts.output_profile.dpi
|
||||||
|
printer.setPaperSize(QSizeF(float(w) / dpi, float(h) / dpi), QPrinter.Inch)
|
||||||
|
|
||||||
if for_comic:
|
if for_comic:
|
||||||
# Comic pages typically have their own margins, or their background
|
# Comic pages typically have their own margins, or their background
|
||||||
@@ -72,6 +79,12 @@ def get_pdf_printer(opts, for_comic=False):
     printer.setOrientation(orientation(opts.orientation))
     printer.setOutputFormat(QPrinter.PdfFormat)
     printer.setFullPage(for_comic)
+    if output_file_name:
+        printer.setOutputFileName(output_file_name)
+    if isosx and not for_comic:
+        # Ensure we are not generating enormous image based PDFs
+        printer.setOutputFormat(QPrinter.NativeFormat)

     return printer

 def get_printer_page_size(opts, for_comic=False):
@@ -163,15 +176,7 @@ class PDFWriter(QObject): # {{{
         if ok:
             item_path = os.path.join(self.tmp_path, '%i.pdf' % len(self.combine_queue))
             self.logger.debug('\tRendering item %s as %i.pdf' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue)))
-            printer = get_pdf_printer(self.opts)
-            printer.setOutputFileName(item_path)
-            # We have to set the engine to Native on OS X after the call to set
-            # filename. Setting a filename with .pdf as the extension causes
-            # Qt to set the format to use Qt's PDF engine even if native was
-            # previously set on the printer. Qt's PDF engine produces image
-            # based PDFs on OS X, so we cannot use it.
-            if isosx:
-                printer.setOutputFormat(QPrinter.NativeFormat)
+            printer = get_pdf_printer(self.opts, output_file_name=item_path)
             self.view.page().mainFrame().evaluateJavaScript('''
                 document.body.style.backgroundColor = "white";

@@ -193,10 +198,7 @@ class PDFWriter(QObject): # {{{
         if self.cover_data is None:
             return
         item_path = os.path.join(self.tmp_path, 'cover.pdf')
-        printer = get_pdf_printer(self.opts)
-        printer.setOutputFileName(item_path)
-        if isosx:
-            printer.setOutputFormat(QPrinter.NativeFormat)
+        printer = get_pdf_printer(self.opts, output_file_name=item_path)
         self.combine_queue.insert(0, item_path)
         p = QPixmap()
         p.loadFromData(self.cover_data)
@@ -248,10 +250,8 @@ class ImagePDFWriter(object):
             os.remove(f.name)

     def render_images(self, outpath, mi, items):
-        printer = get_pdf_printer(self.opts, for_comic=True)
-        printer.setOutputFileName(outpath)
-        if isosx:
-            printer.setOutputFormat(QPrinter.NativeFormat)
+        printer = get_pdf_printer(self.opts, for_comic=True,
+                output_file_name=outpath)
         printer.setDocName(mi.title)
         printer.setCreator(u'%s [%s]'%(__appname__, __version__))
         # Seems to be no way to set author

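The thread running through these PDF hunks is a Qt quirk that each old call site repeated by hand: per the comment removed above, setting an output file name ending in .pdf flips QPrinter back to Qt's own PDF engine, which produces image-based PDFs on OS X. get_pdf_printer() now accepts output_file_name and re-asserts NativeFormat itself after naming the file, so every caller collapses to a single line:

    # The pattern all three call sites now share (the path varies per site):
    printer = get_pdf_printer(self.opts, output_file_name=item_path)
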
@@ -105,6 +105,7 @@ gprefs.defaults['show_files_after_save'] = True
 gprefs.defaults['auto_add_path'] = None
 gprefs.defaults['auto_add_check_for_duplicates'] = False
 gprefs.defaults['blocked_auto_formats'] = []
+gprefs.defaults['auto_add_auto_convert'] = True
 # }}}

 NONE = QVariant() #: Null value to return from the data function of item models

@@ -71,7 +71,7 @@ class AddAction(InterfaceAction):
         ma('add-formats', _('Add files to selected book records'),
                 triggered=self.add_formats, shortcut=_('Shift+A'))
         self.add_menu.addSeparator()
-        ma('add-config', _('Configure the adding of books'),
+        ma('add-config', _('Control the adding of books'),
                 triggered=self.add_config)

         self.qaction.triggered.connect(self.add_books)

@@ -53,6 +53,24 @@ class ConvertAction(InterfaceAction):
         self.queue_convert_jobs(jobs, changed, bad, rows, previous,
                 self.book_auto_converted, extra_job_args=[on_card])

+    def auto_convert_auto_add(self, book_ids):
+        previous = self.gui.library_view.currentIndex()
+        db = self.gui.current_db
+        needed = set()
+        of = prefs['output_format'].lower()
+        for book_id in book_ids:
+            fmts = db.formats(book_id, index_is_id=True)
+            fmts = set(x.lower() for x in fmts.split(',')) if fmts else set()
+            if of not in fmts:
+                needed.add(book_id)
+        if needed:
+            jobs, changed, bad = convert_single_ebook(self.gui,
+                    self.gui.library_view.model().db, needed, True, of,
+                    show_no_format_warning=False)
+            if not jobs: return
+            self.queue_convert_jobs(jobs, changed, bad, list(needed), previous,
+                    self.book_converted, rows_are_ids=True)
+
     def auto_convert_mail(self, to, fmts, delete_from_library, book_ids, format, subject):
         previous = self.gui.library_view.currentIndex()
         rows = [x.row() for x in \
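auto_convert_auto_add() pairs with the auto_add_auto_convert preference added elsewhere in this commit: it only queues a conversion for books that lack the preferred output format. Spelled out, since db.formats() returns a comma-separated string or None:

    # The "needs conversion" membership test used above:
    of = prefs['output_format'].lower()            # e.g. 'epub'
    fmts = db.formats(book_id, index_is_id=True)   # e.g. 'MOBI,AZW3' or None
    have = set(x.lower() for x in fmts.split(',')) if fmts else set()
    if of not in have:
        needed.add(book_id)
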
@@ -118,7 +136,7 @@ class ConvertAction(InterfaceAction):
                 num, 2000)

     def queue_convert_jobs(self, jobs, changed, bad, rows, previous,
-            converted_func, extra_job_args=[]):
+            converted_func, extra_job_args=[], rows_are_ids=False):
         for func, args, desc, fmt, id, temp_files in jobs:
             func, _, same_fmt = func.partition(':')
             same_fmt = same_fmt == 'same_fmt'
@@ -140,7 +158,11 @@ class ConvertAction(InterfaceAction):
             self.conversion_jobs[job] = tuple(args)

         if changed:
-            self.gui.library_view.model().refresh_rows(rows)
+            m = self.gui.library_view.model()
+            if rows_are_ids:
+                m.refresh_ids(rows)
+            else:
+                m.refresh_rows(rows)
             current = self.gui.library_view.currentIndex()
             self.gui.library_view.model().current_changed(current, previous)

@@ -5,7 +5,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import os
+import os, shutil
 from functools import partial

 from PyQt4.Qt import QMenu, QModelIndex, QTimer
@@ -16,6 +16,7 @@ from calibre.gui2.dialogs.confirm_delete import confirm
 from calibre.gui2.dialogs.device_category_editor import DeviceCategoryEditor
 from calibre.gui2.actions import InterfaceAction
 from calibre.ebooks.metadata import authors_to_string
+from calibre.ebooks.metadata.opf2 import OPF
 from calibre.utils.icu import sort_key
 from calibre.db.errors import NoSuchFormat

@@ -79,14 +80,23 @@ class EditMetadataAction(InterfaceAction):
                 Dispatcher(self.metadata_downloaded),
                 ensure_fields=ensure_fields)

+    def cleanup_bulk_download(self, tdir):
+        try:
+            shutil.rmtree(tdir, ignore_errors=True)
+        except:
+            pass
+
     def metadata_downloaded(self, job):
         if job.failed:
             self.gui.job_exception(job, dialog_title=_('Failed to download metadata'))
             return
         from calibre.gui2.metadata.bulk_download import get_job_details
-        id_map, failed_ids, failed_covers, all_failed, det_msg = \
-                get_job_details(job)
+        (aborted, id_map, tdir, log_file, failed_ids, failed_covers, all_failed,
+                det_msg, lm_map) = get_job_details(job)
+        if aborted:
+            return self.cleanup_bulk_download(tdir)
         if all_failed:
+            self.cleanup_bulk_download(tdir)
             return error_dialog(self.gui, _('Download failed'),
                 _('Failed to download metadata or covers for any of the %d'
                 ' book(s).') % len(id_map), det_msg=det_msg, show=True)
@@ -103,28 +113,26 @@ class EditMetadataAction(InterfaceAction):
             msg += '<p>'+_('Could not download metadata and/or covers for %d of the books. Click'
                     ' "Show details" to see which books.')%num

-        payload = (id_map, failed_ids, failed_covers)
+        payload = (id_map, tdir, log_file, lm_map)
         from calibre.gui2.dialogs.message_box import ProceedNotification
         p = ProceedNotification(self.apply_downloaded_metadata,
-                payload, job.html_details,
+                payload, log_file,
                 _('Download log'), _('Download complete'), msg,
                 det_msg=det_msg, show_copy_button=show_copy_button,
-                parent=self.gui)
+                parent=self.gui, log_is_file=True)
         p.show()

     def apply_downloaded_metadata(self, payload):
-        id_map, failed_ids, failed_covers = payload
-        id_map = dict([(k, v) for k, v in id_map.iteritems() if k not in
-            failed_ids])
-        if not id_map:
+        good_ids, tdir, log_file, lm_map = payload
+        if not good_ids:
             return

         modified = set()
         db = self.gui.current_db

-        for i, mi in id_map.iteritems():
+        for i in good_ids:
             lm = db.metadata_last_modified(i, index_is_id=True)
-            if lm > mi.last_modified:
+            if lm > lm_map[i]:
                 title = db.title(i, index_is_id=True)
                 authors = db.authors(i, index_is_id=True)
                 if authors:
@@ -144,7 +152,18 @@ class EditMetadataAction(InterfaceAction):
                 'Do you want to proceed?'), det_msg='\n'.join(modified)):
             return

-        self.apply_metadata_changes(id_map)
+        id_map = {}
+        for bid in good_ids:
+            opf = os.path.join(tdir, '%d.mi'%bid)
+            if not os.path.exists(opf):
+                opf = None
+            cov = os.path.join(tdir, '%d.cover'%bid)
+            if not os.path.exists(cov):
+                cov = None
+            id_map[bid] = (opf, cov)
+
+        self.apply_metadata_changes(id_map, callback=lambda x:
+                self.cleanup_bulk_download(tdir))

     # }}}

@@ -468,6 +487,11 @@ class EditMetadataAction(InterfaceAction):
         callback can be either None or a function accepting a single argument,
         in which case it is called after applying is complete with the list of
         changed ids.
+
+        id_map can also be a mapping of ids to 2-tuples, where each 2-tuple
+        contains the absolute paths to an OPF and cover file respectively. If
+        either of the paths is None, then the corresponding metadata is not
+        updated.
         '''
         if title is None:
             title = _('Applying changed metadata')
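
Note: this docstring paragraph describes the same mapping the loop two hunks above builds. A self-contained sketch of that contract, reusing the '%d.mi'/'%d.cover' naming from this commit:

    import os

    def build_id_map(good_ids, tdir):
        # book id -> (opf_path_or_None, cover_path_or_None), as documented
        id_map = {}
        for bid in good_ids:
            opf = os.path.join(tdir, '%d.mi' % bid)
            cov = os.path.join(tdir, '%d.cover' % bid)
            id_map[bid] = (opf if os.path.exists(opf) else None,
                           cov if os.path.exists(cov) else None)
        return id_map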
@@ -492,28 +516,48 @@ class EditMetadataAction(InterfaceAction):
             return self.finalize_apply()
 
         i, mi = self.apply_id_map[self.apply_current_idx]
+        if isinstance(mi, tuple):
+            opf, cover = mi
+            if opf:
+                mi = OPF(open(opf, 'rb'), basedir=os.path.dirname(opf),
+                        populate_spine=False).to_book_metadata()
+                self.apply_mi(i, mi)
+            if cover:
+                self.gui.current_db.set_cover(i, open(cover, 'rb'),
+                        notify=False, commit=False)
+        else:
+            self.apply_mi(i, mi)
+
+        self.apply_current_idx += 1
+        if self.apply_pd is not None:
+            self.apply_pd.value += 1
+        QTimer.singleShot(50, self.do_one_apply)
+
+    def apply_mi(self, book_id, mi):
         db = self.gui.current_db
 
         try:
             set_title = not mi.is_null('title')
             set_authors = not mi.is_null('authors')
-            idents = db.get_identifiers(i, index_is_id=True)
+            idents = db.get_identifiers(book_id, index_is_id=True)
             if mi.identifiers:
                 idents.update(mi.identifiers)
             mi.identifiers = idents
             if mi.is_null('series'):
                 mi.series_index = None
             if self._am_merge_tags:
-                old_tags = db.tags(i, index_is_id=True)
+                old_tags = db.tags(book_id, index_is_id=True)
                 if old_tags:
                     tags = [x.strip() for x in old_tags.split(',')] + (
                             mi.tags if mi.tags else [])
                     mi.tags = list(set(tags))
-            db.set_metadata(i, mi, commit=False, set_title=set_title,
+            db.set_metadata(book_id, mi, commit=False, set_title=set_title,
                     set_authors=set_authors, notify=False)
-            self.applied_ids.append(i)
+            self.applied_ids.append(book_id)
         except:
             import traceback
-            self.apply_failures.append((i, traceback.format_exc()))
+            self.apply_failures.append((book_id, traceback.format_exc()))
 
         try:
             if mi.cover:
@@ -521,11 +565,6 @@ class EditMetadataAction(InterfaceAction):
         except:
             pass
 
-        self.apply_current_idx += 1
-        if self.apply_pd is not None:
-            self.apply_pd.value += 1
-        QTimer.singleShot(50, self.do_one_apply)
-
     def finalize_apply(self):
         db = self.gui.current_db
         db.commit()
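
Note: the tuple branch above reads metadata back from OPF files that the forked worker wrote to disk. A sketch of that round trip, using only APIs that already appear in this commit (Metadata, metadata_to_opf, OPF); it assumes a calibre environment:

    from io import BytesIO
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf

    mi = Metadata('A Title', ['An Author'])
    raw = metadata_to_opf(mi, default_lang='und')  # OPF XML, as the worker writes it
    mi2 = OPF(BytesIO(raw), populate_spine=False).to_book_metadata()
    assert mi2.title == mi.title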
@@ -113,6 +113,7 @@ class Worker(Thread):
 class AutoAdder(QObject):
 
     metadata_read = pyqtSignal(object)
+    auto_convert = pyqtSignal(object)
 
     def __init__(self, path, parent):
         QObject.__init__(self, parent)
@@ -124,6 +125,8 @@ class AutoAdder(QObject):
             self.metadata_read.connect(self.add_to_db,
                     type=Qt.QueuedConnection)
             QTimer.singleShot(2000, self.initialize)
+            self.auto_convert.connect(self.do_auto_convert,
+                    type=Qt.QueuedConnection)
         elif path:
             prints(path,
                 'is not a valid directory to watch for new ebooks, ignoring')
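
Note: the new auto_convert signal follows the same queued-connection pattern as metadata_read, so the slot runs from the receiver's event loop rather than the emitter's call stack. A minimal PyQt4 sketch of that pattern (names illustrative, not calibre's):

    from PyQt4.Qt import QObject, Qt, pyqtSignal

    class Adder(QObject):
        auto_convert = pyqtSignal(object)

        def __init__(self, parent=None):
            QObject.__init__(self, parent)
            self.auto_convert.connect(self.do_auto_convert,
                    type=Qt.QueuedConnection)

        def do_auto_convert(self, added_ids):
            # Queued delivery requires a running Qt event loop
            print(sorted(added_ids))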
@@ -163,6 +166,7 @@ class AutoAdder(QObject):
 
         needs_rescan = False
         duplicates = []
+        added_ids = set()
 
         for fname, tdir in data.iteritems():
             paths = [os.path.join(self.worker.path, fname)]
@@ -187,9 +191,12 @@ class AutoAdder(QObject):
                     continue
                 mi = [OPF(open(mi, 'rb'), tdir,
                     populate_spine=False).to_book_metadata()]
-                dups, num = m.add_books(paths,
+                dups, ids = m.add_books(paths,
                         [os.path.splitext(fname)[1][1:].upper()], mi,
-                        add_duplicates=not gprefs['auto_add_check_for_duplicates'])
+                        add_duplicates=not gprefs['auto_add_check_for_duplicates'],
+                        return_ids=True)
+                added_ids |= set(ids)
+                num = len(ids)
                 if dups:
                     path = dups[0][0]
                     with open(os.path.join(tdir, 'dup_cache.'+dups[1][0].lower()),
@@ -217,8 +224,10 @@ class AutoAdder(QObject):
                     _('Books with the same title as the following already '
                     'exist in the database. Add them anyway?'),
                     '\n'.join(files)):
-                dups, num = m.add_books(paths, formats, metadata,
-                        add_duplicates=True)
+                dups, ids = m.add_books(paths, formats, metadata,
+                        add_duplicates=True, return_ids=True)
+                added_ids |= set(ids)
+                num = len(ids)
             count += num
 
         for tdir in data.itervalues():
@@ -227,6 +236,9 @@ class AutoAdder(QObject):
             except:
                 pass
 
+        if added_ids and gprefs['auto_add_auto_convert']:
+            self.auto_convert.emit(added_ids)
+
         if count > 0:
             m.books_added(count)
             gui.status_bar.show_message(_(
@@ -238,4 +250,7 @@ class AutoAdder(QObject):
         if needs_rescan:
             QTimer.singleShot(2000, self.dir_changed)
 
+    def do_auto_convert(self, added_ids):
+        gui = self.parent()
+        gui.iactions['Convert Books'].auto_convert_auto_add(added_ids)
 
@@ -160,7 +160,7 @@ class ProceedNotification(MessageBox): # {{{
 
     def __init__(self, callback, payload, html_log, log_viewer_title, title, msg,
             det_msg='', show_copy_button=False, parent=None,
-            cancel_callback=None):
+            cancel_callback=None, log_is_file=False):
         '''
         A non modal popup that notifies the user that a background task has
         been completed.
@@ -175,12 +175,15 @@ class ProceedNotification(MessageBox): # {{{
         :param title: The title for this popup
         :param msg: The msg to display
         :param det_msg: Detailed message
+        :param log_is_file: If True the html_log parameter is interpreted as
+            the path to a file on disk containing the log encoded with utf-8
         '''
         MessageBox.__init__(self, MessageBox.QUESTION, title, msg,
                 det_msg=det_msg, show_copy_button=show_copy_button,
                 parent=parent)
         self.payload = payload
         self.html_log = html_log
+        self.log_is_file = log_is_file
         self.log_viewer_title = log_viewer_title
 
         self.vlb = self.bb.addButton(_('View log'), self.bb.ActionRole)
@@ -192,7 +195,11 @@ class ProceedNotification(MessageBox): # {{{
         _proceed_memory.append(self)
 
     def show_log(self):
-        self.log_viewer = ViewLog(self.log_viewer_title, self.html_log,
+        log = self.html_log
+        if self.log_is_file:
+            with open(log, 'rb') as f:
+                log = f.read().decode('utf-8')
+        self.log_viewer = ViewLog(self.log_viewer_title, log,
                 parent=self)
 
     def do_proceed(self, result):
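
Note: log_is_file changes only how show_log() interprets html_log. The new contract, as a minimal sketch:

    def read_log(html_log, log_is_file=False):
        # When log_is_file is True, html_log is the path to a utf-8 encoded
        # file on disk; otherwise it is already the log text itself.
        if log_is_file:
            with open(html_log, 'rb') as f:
                return f.read().decode('utf-8')
        return html_log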
@@ -202,9 +209,9 @@ class ProceedNotification(MessageBox): # {{{
         gui = get_gui()
         gui.proceed_requested.emit(func, self.payload)
         # Ensure this notification is garbage collected
+        self.vlb.clicked.disconnect()
         self.callback = self.cancel_callback = self.payload = None
         self.setParent(None)
-        self.vlb.clicked.disconnect()
         _proceed_memory.remove(self)
 
     def done(self, r):
@@ -140,34 +140,6 @@
      </item>
     </layout>
    </item>
-   <item>
-    <widget class="QGroupBox" name="groupBox">
-     <property name="maximumSize">
-      <size>
-       <width>16777215</width>
-       <height>60</height>
-      </size>
-     </property>
-     <layout class="QHBoxLayout" name="horizontalLayout_5">
-      <item>
-       <widget class="QLabel" name="label_51">
-        <property name="sizePolicy">
-         <sizepolicy hsizetype="Preferred" vsizetype="Preferred">
-          <horstretch>40</horstretch>
-          <verstretch>0</verstretch>
-         </sizepolicy>
-        </property>
-        <property name="text">
-         <string/>
-        </property>
-        <property name="buddy">
-         <cstring>matchkind</cstring>
-        </property>
-       </widget>
-      </item>
-     </layout>
-    </widget>
-   </item>
    <item>
    <widget class="QLabel" name="label_6">
     <property name="maximumSize">
@@ -402,7 +402,8 @@ class DetailView(QDialog, Ui_Dialog): # {{{
         self.setupUi(self)
         self.setWindowTitle(job.description)
         self.job = job
-        self.html_view = hasattr(job, 'html_details')
+        self.html_view = (hasattr(job, 'html_details') and not getattr(job,
+            'ignore_html_details', False))
         if self.html_view:
             self.log.setVisible(False)
         else:
@@ -187,9 +187,10 @@ class BooksModel(QAbstractTableModel): # {{{
         self.db = None
         self.reset()
 
-    def add_books(self, paths, formats, metadata, add_duplicates=False):
+    def add_books(self, paths, formats, metadata, add_duplicates=False,
+            return_ids=False):
         ret = self.db.add_books(paths, formats, metadata,
-                add_duplicates=add_duplicates)
+                add_duplicates=add_duplicates, return_ids=return_ids)
         self.count_changed()
         return ret
 
@@ -7,22 +7,41 @@ __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
+import os, time, shutil
 from functools import partial
-from itertools import izip
-from threading import Event
 
 from PyQt4.Qt import (QIcon, QDialog,
         QDialogButtonBox, QLabel, QGridLayout, QPixmap, Qt)
 
 from calibre.gui2.threaded_jobs import ThreadedJob
-from calibre.ebooks.metadata.sources.identify import identify, msprefs
-from calibre.ebooks.metadata.sources.covers import download_cover
-from calibre.ebooks.metadata.book.base import Metadata
-from calibre.customize.ui import metadata_plugins
-from calibre.ptempfile import PersistentTemporaryFile
-from calibre.utils.date import as_utc
+from calibre.ebooks.metadata.opf2 import metadata_to_opf
+from calibre.utils.ipc.simple_worker import fork_job, WorkerError
+from calibre.ptempfile import (PersistentTemporaryDirectory,
+        PersistentTemporaryFile)
 
 # Start download {{{
+
+class Job(ThreadedJob):
+
+    ignore_html_details = True
+
+    def consolidate_log(self):
+        self.consolidated_log = self.log.plain_text
+        self.log = None
+
+    def read_consolidated_log(self):
+        return self.consolidated_log
+
+    @property
+    def details(self):
+        if self.consolidated_log is None:
+            return self.log.plain_text
+        return self.read_consolidated_log()
+
+    @property
+    def log_file(self):
+        return open(self.download_debug_log, 'rb')
+
 def show_config(gui, parent):
     from calibre.gui2.preferences import show_config_widget
     show_config_widget('Sharing', 'Metadata download', parent=parent,
@@ -104,19 +123,22 @@ def start_download(gui, ids, callback, ensure_fields=None):
     d.b.clicked.disconnect()
     if ret != d.Accepted:
         return
+    tf = PersistentTemporaryFile('_metadata_bulk_log_')
+    tf.close()
 
-    for batch in split_jobs(ids):
-        job = ThreadedJob('metadata bulk download',
-            _('Download metadata for %d books')%len(batch),
-            download, (batch, gui.current_db, d.identify, d.covers,
-                ensure_fields), {}, callback)
-        gui.job_manager.run_threaded_job(job)
+    job = Job('metadata bulk download',
+            _('Download metadata for %d books')%len(ids),
+            download, (ids, tf.name, gui.current_db, d.identify, d.covers,
+                ensure_fields), {}, callback)
+    job.download_debug_log = tf.name
+    gui.job_manager.run_threaded_job(job)
     gui.status_bar.show_message(_('Metadata download started'), 3000)
 
 # }}}
 
 def get_job_details(job):
-    id_map, failed_ids, failed_covers, title_map, all_failed = job.result
+    (aborted, good_ids, tdir, log_file, failed_ids, failed_covers, title_map,
+            lm_map, all_failed) = job.result
     det_msg = []
     for i in failed_ids | failed_covers:
         title = title_map[i]
@@ -126,92 +148,89 @@ def get_job_details(job):
             title += (' ' + _('(Failed cover)'))
         det_msg.append(title)
     det_msg = '\n'.join(det_msg)
-    return id_map, failed_ids, failed_covers, all_failed, det_msg
+    return (aborted, good_ids, tdir, log_file, failed_ids, failed_covers,
+            all_failed, det_msg, lm_map)
 
-def merge_result(oldmi, newmi, ensure_fields=None):
-    dummy = Metadata(_('Unknown'))
-    for f in msprefs['ignore_fields']:
-        if ':' in f or (ensure_fields and f in ensure_fields):
-            continue
-        setattr(newmi, f, getattr(dummy, f))
-    fields = set()
-    for plugin in metadata_plugins(['identify']):
-        fields |= plugin.touched_fields
+class HeartBeat(object):
+    CHECK_INTERVAL = 300 # seconds
+    ''' Check that the file count in tdir changes every five minutes '''
 
-    def is_equal(x, y):
-        if hasattr(x, 'tzinfo'):
-            x = as_utc(x)
-        if hasattr(y, 'tzinfo'):
-            y = as_utc(y)
-        return x == y
+    def __init__(self, tdir):
+        self.tdir = tdir
+        self.last_count = len(os.listdir(self.tdir))
+        self.last_time = time.time()
 
-    for f in fields:
-        # Optimize so that set_metadata does not have to do extra work later
-        if not f.startswith('identifier:'):
-            if (not newmi.is_null(f) and is_equal(getattr(newmi, f),
-                    getattr(oldmi, f))):
-                setattr(newmi, f, getattr(dummy, f))
+    def __call__(self):
+        if time.time() - self.last_time > self.CHECK_INTERVAL:
+            c = len(os.listdir(self.tdir))
+            if c == self.last_count:
+                return False
+            self.last_count = c
+            self.last_time = time.time()
+        return True
 
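Note: assuming the HeartBeat class above is in scope, its contract can be exercised directly; the fork_job machinery it is passed to treats a False return as a hung worker:

    import tempfile

    watch_dir = tempfile.mkdtemp()
    beat = HeartBeat(watch_dir)
    beat.CHECK_INTERVAL = -1  # force the staleness check on the next call
    print(beat())             # False: no new files have appeared in watch_dir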
-    newmi.last_modified = oldmi.last_modified
+# Fix log viewer, ratings
+# Test: abort, covers only, metadata only, both, 200 entry download, memory
+# consumption, all errors and on and on
 
-    return newmi
-
-def download(ids, db, do_identify, covers, ensure_fields,
+def download(all_ids, tf, db, do_identify, covers, ensure_fields,
         log=None, abort=None, notifications=None):
-    ids = list(ids)
-    metadata = [db.get_metadata(i, index_is_id=True, get_user_categories=False)
-            for i in ids]
+    batch_size = 10
+    batches = split_jobs(all_ids, batch_size=batch_size)
+    tdir = PersistentTemporaryDirectory('_metadata_bulk_')
+    heartbeat = HeartBeat(tdir)
 
     failed_ids = set()
     failed_covers = set()
     title_map = {}
-    ans = {}
-    count = 0
+    lm_map = {}
+    ans = set()
     all_failed = True
-    '''
-    # Test apply dialog
-    all_failed = do_identify = covers = False
-    '''
-    for i, mi in izip(ids, metadata):
+    aborted = False
+    count = 0
+    for ids in batches:
         if abort.is_set():
             log.error('Aborting...')
             break
-        title, authors, identifiers = mi.title, mi.authors, mi.identifiers
-        title_map[i] = title
-        if do_identify:
-            results = []
-            try:
-                results = identify(log, Event(), title=title, authors=authors,
-                        identifiers=identifiers)
-            except:
-                pass
-            if results:
-                all_failed = False
-                mi = merge_result(mi, results[0], ensure_fields=ensure_fields)
-                identifiers = mi.identifiers
-                if not mi.is_null('rating'):
-                    # set_metadata expects a rating out of 10
-                    mi.rating *= 2
-            else:
-                log.error('Failed to download metadata for', title)
-                failed_ids.add(i)
-                # We don't want set_metadata operating on anything but covers
-                mi = merge_result(mi, mi, ensure_fields=ensure_fields)
-        if covers:
-            cdata = download_cover(log, title=title, authors=authors,
-                    identifiers=identifiers)
-            if cdata is not None:
-                with PersistentTemporaryFile('.jpg', 'downloaded-cover-') as f:
-                    f.write(cdata[-1])
-                mi.cover = f.name
-                all_failed = False
-            else:
-                failed_covers.add(i)
-        ans[i] = mi
-        count += 1
+        metadata = {i:db.get_metadata(i, index_is_id=True,
+            get_user_categories=False) for i in ids}
+        for i in ids:
+            title_map[i] = metadata[i].title
+            lm_map[i] = metadata[i].last_modified
+        metadata = {i:metadata_to_opf(mi, default_lang='und') for i, mi in
+            metadata.iteritems()}
+        try:
+            ret = fork_job('calibre.ebooks.metadata.sources.worker', 'main',
+                    (do_identify, covers, metadata, ensure_fields),
+                    cwd=tdir, abort=abort, heartbeat=heartbeat, no_output=True)
+        except WorkerError as e:
+            if e.orig_tb:
+                raise Exception('Failed to download metadata. Original '
+                        'traceback: \n\n'+e.orig_tb)
+            raise
+        count += batch_size
         notifications.put((count/len(ids),
-            _('Downloaded %(num)d of %(tot)d')%dict(num=count, tot=len(ids))))
+            _('Downloaded %(num)d of %(tot)d')%dict(
+                num=count, tot=len(all_ids))))
+
+        fids, fcovs, allf = ret['result']
+        if not allf:
+            all_failed = False
+        failed_ids = failed_ids.union(fids)
+        failed_covers = failed_covers.union(fcovs)
+        ans = ans.union(set(ids) - fids)
+        for book_id in ids:
+            lp = os.path.join(tdir, '%d.log'%book_id)
+            if os.path.exists(lp):
+                with open(tf, 'ab') as dest, open(lp, 'rb') as src:
+                    dest.write(('\n'+'#'*20 + ' Log for %s '%title_map[book_id] +
+                        '#'*20+'\n').encode('utf-8'))
+                    shutil.copyfileobj(src, dest)
+
+    if abort.is_set():
+        aborted = True
     log('Download complete, with %d failures'%len(failed_ids))
-    return (ans, failed_ids, failed_covers, title_map, all_failed)
+    return (aborted, ans, tdir, tf, failed_ids, failed_covers, title_map,
+            lm_map, all_failed)
 
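Note: split_jobs() is called above with a batch_size keyword but is not shown in this commit. A stand-in with the behaviour download() appears to rely on (fixed-size, order-preserving chunks):

    def split_jobs(ids, batch_size=10):
        # Assumed behaviour only: chunk ids into lists of at most batch_size.
        ids = list(ids)
        return [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)]

    print(split_jobs(range(25)))  # three batches: 10 + 10 + 5 ids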
@@ -161,10 +161,10 @@ class MetadataSingleDialogBase(ResizableDialog):
         self.manage_authors_button.clicked.connect(self.authors.manage_authors)
 
         self.series = SeriesEdit(self)
-        self.remove_unused_series_button = QToolButton(self)
-        self.remove_unused_series_button.setToolTip(
-               _('Remove unused series (Series that have no books)') )
-        self.remove_unused_series_button.clicked.connect(self.remove_unused_series)
+        self.clear_series_button = QToolButton(self)
+        self.clear_series_button.setToolTip(
+               _('Clear series') )
+        self.clear_series_button.clicked.connect(self.series.clear)
         self.series_index = SeriesIndexEdit(self, self.series)
         self.basic_metadata_widgets.extend([self.series, self.series_index])
 
@@ -198,6 +198,7 @@ class MetadataSingleDialogBase(ResizableDialog):
         self.basic_metadata_widgets.append(self.identifiers)
         self.clear_identifiers_button = QToolButton(self)
         self.clear_identifiers_button.setIcon(QIcon(I('trash.png')))
+        self.clear_identifiers_button.setToolTip(_('Clear Ids'))
         self.clear_identifiers_button.clicked.connect(self.identifiers.clear)
         self.paste_isbn_button = QToolButton(self)
         self.paste_isbn_button.setToolTip('<p>' +
@@ -303,17 +304,6 @@ class MetadataSingleDialogBase(ResizableDialog):
         self.title_sort.auto_generate()
         self.author_sort.auto_generate()
 
-    def remove_unused_series(self, *args):
-        self.db.remove_unused_series()
-        idx = self.series.current_val
-        self.series.clear()
-        self.series.initialize(self.db, self.book_id)
-        if idx:
-            for i in range(self.series.count()):
-                if unicode(self.series.itemText(i)) == idx:
-                    self.series.setCurrentIndex(i)
-                    break
-
     def tags_editor(self, *args):
         self.tags.edit(self.db, self.book_id)
 
@@ -591,7 +581,7 @@ class MetadataSingleDialog(MetadataSingleDialogBase): # {{{
         sto(self.title_sort, self.authors)
         create_row(1, self.authors, self.deduce_author_sort_button, self.author_sort)
         sto(self.author_sort, self.series)
-        create_row(2, self.series, self.remove_unused_series_button,
+        create_row(2, self.series, self.clear_series_button,
                 self.series_index, icon='trash.png')
         sto(self.series_index, self.swap_title_author_button)
         sto(self.swap_title_author_button, self.manage_authors_button)
@@ -756,7 +746,7 @@ class MetadataSingleDialogAlt1(MetadataSingleDialogBase): # {{{
                 span=2, icon='auto_author_sort.png')
         create_row(3, self.author_sort, self.series)
         create_row(4, self.series, self.series_index,
-                button=self.remove_unused_series_button, icon='trash.png')
+                button=self.clear_series_button, icon='trash.png')
         create_row(5, self.series_index, self.tags)
         create_row(6, self.tags, self.rating, button=self.tags_editor_button)
         create_row(7, self.rating, self.pubdate)
@@ -892,7 +882,7 @@ class MetadataSingleDialogAlt2(MetadataSingleDialogBase): # {{{
                 span=2, icon='auto_author_sort.png')
         create_row(3, self.author_sort, self.series)
         create_row(4, self.series, self.series_index,
-                button=self.remove_unused_series_button, icon='trash.png')
+                button=self.clear_series_button, icon='trash.png')
         create_row(5, self.series_index, self.tags)
         create_row(6, self.tags, self.rating, button=self.tags_editor_button)
         create_row(7, self.rating, self.pubdate)
@@ -36,6 +36,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
         r('new_book_tags', prefs, setting=CommaSeparatedList)
         r('auto_add_path', gprefs, restart_required=True)
         r('auto_add_check_for_duplicates', gprefs)
+        r('auto_add_auto_convert', gprefs)
 
         self.filename_pattern = FilenamePattern(self)
         self.metadata_box.layout().insertWidget(0, self.filename_pattern)
@@ -151,6 +151,19 @@ Author matching is exact.</string>
     <string>&amp;Automatic Adding</string>
    </attribute>
    <layout class="QGridLayout" name="gridLayout_3">
+    <item row="3" column="0" colspan="2">
+     <widget class="QCheckBox" name="opt_auto_add_check_for_duplicates">
+      <property name="toolTip">
+       <string>If set, this option will cause calibre to check if a file
+being auto-added is already in the calibre library.
+If it is, a message will pop up asking you whether
+you want to add it anyway.</string>
+      </property>
+      <property name="text">
+       <string>Check for &amp;duplicates when auto-adding files</string>
+      </property>
+     </widget>
+    </item>
     <item row="0" column="0" colspan="2">
      <widget class="QLabel" name="label">
      <property name="text">
@@ -168,7 +181,7 @@ Author matching is exact.</string>
      </property>
     </widget>
    </item>
-    <item row="4" column="0">
+    <item row="5" column="0">
     <widget class="QGroupBox" name="groupBox">
      <property name="title">
       <string>Ignore files with the following extensions when automatically adding </string>
@@ -187,7 +200,7 @@ Author matching is exact.</string>
     </layout>
    </widget>
   </item>
-    <item row="4" column="1">
+    <item row="5" column="1">
     <spacer name="horizontalSpacer_2">
      <property name="orientation">
       <enum>Qt::Horizontal</enum>
@@ -225,16 +238,10 @@ Author matching is exact.</string>
      </item>
     </layout>
    </item>
-    <item row="3" column="0" colspan="2">
-     <widget class="QCheckBox" name="opt_auto_add_check_for_duplicates">
-      <property name="toolTip">
-       <string>If set, this option will cause calibre to check if a file
-being auto-added is already in the calibre library.
-If it is, a message will pop up asking you whether
-you want to add it anyway.</string>
-      </property>
+    <item row="4" column="0">
+     <widget class="QCheckBox" name="opt_auto_add_auto_convert">
      <property name="text">
-       <string>Check for &amp;duplicates when auto-adding files</string>
+       <string>Automatically &amp;convert added files to the current output format</string>
      </property>
     </widget>
    </item>
@@ -73,11 +73,13 @@ class OpenSearchOPDSStore(StorePlugin):
                 type = link.get('type')
 
                 if rel and href and type:
-                    if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
+                    if 'http://opds-spec.org/thumbnail' in rel:
                         s.cover_url = href
-                    elif rel == u'http://opds-spec.org/acquisition/buy':
+                    elif 'http://opds-spec.org/image/thumbnail' in rel:
+                        s.cover_url = href
+                    elif 'http://opds-spec.org/acquisition/buy' in rel:
                         s.detail_item = href
-                    elif rel == u'http://opds-spec.org/acquisition':
+                    elif 'http://opds-spec.org/acquisition' in rel:
                         if type:
                             ext = mimetypes.guess_extension(type)
                             if ext:
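
Note: the equality tests become containment tests because an Atom link's rel attribute may carry several whitespace-separated relation values. A sketch of the difference:

    rel = 'http://opds-spec.org/acquisition http://opds-spec.org/acquisition/buy'
    assert 'http://opds-spec.org/acquisition/buy' in rel   # containment matches
    assert rel != u'http://opds-spec.org/acquisition/buy'  # equality does not

Since containment also lets 'http://opds-spec.org/acquisition' match the buy variant, the more specific buy branch has to stay first, as it does above.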
@@ -25,7 +25,7 @@ from calibre.ebooks.conversion.config import GuiRecommendations, \
 from calibre.gui2.convert import bulk_defaults_for_input_format
 
 def convert_single_ebook(parent, db, book_ids, auto_conversion=False, # {{{
-        out_format=None):
+        out_format=None, show_no_format_warning=True):
     changed = False
     jobs = []
     bad = []
@@ -91,7 +91,7 @@ def convert_single_ebook(parent, db, book_ids, auto_conversion=False, # {{{
         except NoSupportedInputFormats:
             bad.append(book_id)
 
-    if bad != []:
+    if bad and show_no_format_warning:
         res = []
         for id in bad:
             title = db.title(id, True)
@@ -3243,7 +3243,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
         return id
 
 
-    def add_books(self, paths, formats, metadata, add_duplicates=True):
+    def add_books(self, paths, formats, metadata, add_duplicates=True,
+            return_ids=False):
         '''
         Add a book to the database. The result cache is not updated.
         :param:`paths` List of paths to book files or file-like objects
@@ -3289,7 +3290,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
             formats = list(duplicate[1] for duplicate in duplicates)
             metadata = list(duplicate[2] for duplicate in duplicates)
             return (paths, formats, metadata), len(ids)
-        return None, len(ids)
+        return None, (ids if return_ids else len(ids))
 
     def import_book(self, mi, formats, notify=True, import_hooks=True,
             apply_import_tags=True, preserve_uuid=False):
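
Note: return_ids changes only the second element of add_books()'s return value. A pure-Python stand-in showing the shape change:

    def add_books_result(ids, return_ids=False):
        # Mirrors the changed return statement above; the real method is
        # LibraryDatabase2.add_books.
        return None, (list(ids) if return_ids else len(ids))

    print(add_books_result([5, 9, 12]))                   # (None, 3)
    print(add_books_result([5, 9, 12], return_ids=True))  # (None, [5, 9, 12])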
@@ -648,7 +648,10 @@ class BasicNewsRecipe(Recipe):
          'url'         : URL of print version,
          'date'        : The publication date of the article as a string,
          'description' : A summary of the article
-         'content'     : The full article (can be an empty string). This is used by FullContentProfile
+         'content'     : The full article (can be an empty string). Obsolete,
+                         do not use; instead save the content to a temporary
+                         file and pass a file:///path/to/temp/file.html as
+                         the URL.
         }
 
         For an example, see the recipe for downloading `The Atlantic`.
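
Note: a sketch of the temporary-file pattern the docstring now recommends, using calibre's PersistentTemporaryFile (the helper name article_url is illustrative, not a calibre API):

    from calibre.ptempfile import PersistentTemporaryFile

    def article_url(content):
        # Persist the article body, then hand its file:// URL to the
        # article dict returned by parse_index().
        f = PersistentTemporaryFile('.html')
        f.write(content.encode('utf-8'))
        f.close()
        return 'file://' + f.name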