mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

commit bf2850019e: 0.8.45+
@@ -13,7 +13,7 @@ class HighCountryNews(BasicNewsRecipe):
__author__ = 'Armin Geller' # 2012-01-31
publisher = 'High Country News'
timefmt = ' [%a, %d %b %Y]'
language = 'en-Us'
language = 'en'
encoding = 'UTF-8'
publication_type = 'newspaper'
oldest_article = 7
@@ -1,45 +1,73 @@
# Talking Points is not grabbing everything.
# The look is right, but only the last one added?
import re
import string, re
import time
import traceback
# above for debugging via stack
from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
# strip ads and graphics
# Current Column lacks a title.
# Talking Points Memo - shorten title - Remove year and Bill's name

import os, time, traceback, re, urlparse, sys, cStringIO
from collections import defaultdict
from functools import partial
from contextlib import nested, closing

from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending

# To Do: strip ads and graphics, Current Column lacks a title.
# The newsletter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12
# ./ebook-convert --username xxx --password xxx

# this is derived from BasicNewsRecipe, so it can only overload those.
# Some of what we need is otherwise in article, so we have more copy to do than otherwise.
class OReillyPremium(BasicNewsRecipe):
title = u'OReilly Premium'
__author__ = 'TMcN'
language = 'en'
description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
custom_title = 'Bill O\'Reilly Premium - '+ time.strftime('%d %b %Y')
title = 'Bill O\'Reilly Premium'
auto_cleanup = True
conversion_options = {'linearize_tables': True}
encoding = 'utf8'
needs_subscription = True
language = 'en'
no_stylesheets = True
oldest_article = 20
needs_subscription = True
oldest_article = 31
remove_javascript = True
remove_tags = [dict(name='img', attrs={})]
# Don't go down
recursions = 0
max_articles_per_feed = 2000
max_articles_per_feed = 20

debugMessages = True

# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
# ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
# ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
# ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
# ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
]

feeds = [
(u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
(u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
(u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
(u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
(u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
]
# http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is word for the day.

# Note: Talking Points is broken in the above model; the site changed to more Ajax-y.
# Now using RSS

def get_browser(self):
print("In get_browser")
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
@@ -66,6 +94,7 @@ class OReillyPremium(BasicNewsRecipe):
def stripBadChars(self, inString) :
return inString.replace("\'", "")

def parseGeneric(self, baseURL):
# Does a generic parsing of the articles. There are six categories (0-5)
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList

@@ -73,6 +102,7 @@ class OReillyPremium(BasicNewsRecipe):
fullReturn = []
for i in range(len(self.catList)) :
articleList = []
print("In "+self.catList[i][0]+", index: "+ str(i))
soup = self.index_to_soup(self.catList[i][1])
# Set defaults
description = 'None'
@@ -81,14 +111,12 @@ class OReillyPremium(BasicNewsRecipe):
# 3-5 create one.
# So no for-div for 3-5

if i < 3 :
if i == 0 :
print("Starting TV Archives")
for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
print("Next DIV:")
print(div)
if i == 1:
a = div.find('a', href=True)
else :
a = div
print(a)
a = div
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
@@ -96,82 +124,63 @@ class OReillyPremium(BasicNewsRecipe):
continue
# url = baseURL+re.sub(r'\?.*', '', a['href'])
url = baseURL+a['href']
if i < 2 :
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = self.tag_to_string(a, use_alt=True).strip()
elif i == 2 :
# Daily Briefs
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = div.contents[0]
if self.debugMessages :
print(title+" @ "+url)
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = self.tag_to_string(a, use_alt=True).strip()
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))

elif i == 3 : # Stratfor
a = soup.find('a', self.catList[i][3])
if a is None :
continue
url = baseURL+a['href']
title = self.tag_to_string(a, use_alt=True).strip()
# Get Stratfor contents so we can get the real title.
stratSoup = self.index_to_soup(url)
title = stratSoup.html.head.title.string
stratIndex = title.find('Stratfor.com:', 0)
if (stratIndex > -1) :
title = title[stratIndex+14:-1]
# Look for first blogBody <td class="blogBody"
# Changed 12 Jan 2012 - new page format
#stratBlogTable = stratSoup.find('td', {'class':['blogBody']}).findParent('table')
#stratBody = stratSoup.find('td', {'class':['blogBody']})
elif i == 4 : # Talking Points
topDate = soup.find("td", "blogBody")
if not topDate :
print("Failed to find date in Talking Points")
# This page has the contents in double-wrapped tables!
myTable = topDate.findParents('table')[0]
if myTable is not None:
upOneTable = myTable.findParents('table')[0]
if upOneTable is not None:
upTwo = upOneTable.findParents('table')[0]
if upTwo is None:
continue
# Now navigate rows of upTwo
if self.debugMessages :
print("Entering rows")
for rows in upTwo.findChildren("tr", recursive=False):
# Inside top level table, each row is an article
rowTable = rows.find("table")
articleTable = rowTable.find("table")
# This looks wrong.
articleTable = rows.find("tr")
# The middle table is just for formatting the article buffer... but this means we can skip the inner table.
blogDate = articleTable.find("a","blogDate").contents[0]
# Skip to second blogBody for this.
blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
url = baseURL+re.sub(r'\?.*', '', blogURL)
title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
if self.debugMessages :
print("Talking Points Memo title "+title+" at url: "+url)
pubdate = time.strftime('%a, %d %b')
articleList.append(dict(title=title, url=url, date=pubdate, description='None', content=''))
else : # Current Column
titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
if titleSpan is None :
print("No Current Column Title Span")
print(soup)
continue
title = titleSpan.contents[0]
url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
if i == 3 or i == 5 :
if i == 1 :
if self.debugMessages :
print(self.catList[i][0]+" Title:"+title+" at url: "+url)
summary = div.find(True, attrs={'class':'summary'})
if summary:
print("At Summary")
print(summary)
if summary is not None:
description = self.tag_to_string(summary, use_alt=False)
print("At append")
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
self.catList[i][3] = articleList
fullReturn.append((self.catList[i][0], articleList))
print("Returning")
# print fullReturn
return fullReturn
# build_index() starts with:
# try:
# feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
# max_articles_per_feed=self.max_articles_per_feed,
# log=self.log)
# self.report_progress(0, _('Got feeds from index page'))
# except NotImplementedError:
# feeds = self.parse_feeds()

# which in turn is from __init__.py
#def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
# log=default_log):
#'''
#@param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
#@return: A list of L{Feed} objects.
#@rtype: list
#'''
#feeds = []
#for title, articles in index:
# pfeed = Feed(log=log)
# pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
# max_articles_per_feed=max_articles_per_feed)
# feeds.append(pfeed)
# return feeds

# use_embedded_content defaults to None, at which point if the content is > 2K, it is used as the article.

# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
@@ -182,12 +191,19 @@ class OReillyPremium(BasicNewsRecipe):
# 'content' : The full article (can be an empty string). This is used by FullContentProfile
# }
# this is used instead of BasicNewsRecipe.parse_feeds().
# it is called by download
def parse_index(self):
# Parse the page into Python Soup
print("Entering recipe print_index from:")
traceback.print_stack()
print("web")
baseURL = "https://www.billoreilly.com"
return self.parseGeneric(baseURL)
masterList = self.parseGeneric(baseURL)
#print(masterList)
return masterList

def preprocess_html(self, soup):
print("In preprocess_html")
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
@@ -195,3 +211,128 @@ class OReillyPremium(BasicNewsRecipe):
raw = self.browser.open('https://www.billoreilly.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))

def build_index(self):
print("In OReilly build_index()\n\n")
feedsRSS = []
self.report_progress(0, _('Fetching feeds...'))
#try:
feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
log=self.log)
self.report_progress(0, _('Got feeds from index page'))
#except NotImplementedError:
# feeds = self.parse_feeds()
# Now add regular feeds.
feedsRSS = self.parse_feeds()
print ("feedsRSS is type "+feedsRSS.__class__.__name__)

for articles in feedsRSS:
print("articles is type "+articles.__class__.__name__)
print("Title:" + articles.title)
feeds.append(articles)
if not feeds:
raise ValueError('No articles found, aborting')

#feeds = FeedCollection(feeds)

self.report_progress(0, _('Trying to download cover...'))
self.download_cover()
self.report_progress(0, _('Generating masthead...'))
self.masthead_path = None

try:
murl = self.get_masthead_url()
except:
self.log.exception('Failed to get masthead url')
murl = None

if murl is not None:
# Try downloading the user-supplied masthead_url
# Failure sets self.masthead_path to None
self.download_masthead(murl)
if self.masthead_path is None:
self.log.info("Synthesizing mastheadImage")
self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
try:
self.default_masthead_image(self.masthead_path)
except:
self.log.exception('Failed to generate default masthead image')
self.masthead_path = None

if self.test:
feeds = feeds[:2]
self.has_single_feed = len(feeds) == 1

index = os.path.join(self.output_dir, 'index.html')

html = self.feeds2index(feeds)
with open(index, 'wb') as fi:
fi.write(html)
self.jobs = []

if self.reverse_article_order:
for feed in feeds:
if hasattr(feed, 'reverse'):
feed.reverse()

self.feed_objects = feeds
for f, feed in enumerate(feeds):
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
if not os.path.isdir(feed_dir):
os.makedirs(feed_dir)

for a, article in enumerate(feed):
if a >= self.max_articles_per_feed:
break
art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir):
os.makedirs(art_dir)
try:
url = self.print_version(article.url)
except NotImplementedError:
url = article.url
except:
self.log.exception('Failed to find print version for: '+article.url)
url = None
if not url:
continue
func, arg = (self.fetch_embedded_article, article) \
if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \
else \
((self.fetch_obfuscated_article if self.articles_are_obfuscated \
else self.fetch_article), url)
req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
{}, (f, a), self.article_downloaded,
self.error_in_article_download)
req.feed = feed
req.article = article
req.feed_dir = feed_dir
self.jobs.append(req)

self.jobs_done = 0
tp = ThreadPool(self.simultaneous_downloads)
for req in self.jobs:
tp.putRequest(req, block=True, timeout=0)

self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
while True:
try:
tp.poll()
time.sleep(0.1)
except NoResultsPending:
break
for f, feed in enumerate(feeds):
print("Writing feeds for "+feed.title)
html = self.feed2index(f,feeds)
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
fi.write(html)
self.create_opf(feeds)
self.report_progress(1, _('Feeds downloaded to %s')%index)

return index
@@ -1,7 +1,9 @@
# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
import string, re
import time
from urlparse import urlparse
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString

class RealClear(BasicNewsRecipe):
title = u'Real Clear'

@@ -20,12 +22,13 @@ class RealClear(BasicNewsRecipe):
# Don't go down
recursions = 0
max_articles_per_feed = 400
debugMessages = False
debugMessages = True

# Numeric parameter is type, controls whether we look for
feedsets = [
["Politics", "http://www.realclearpolitics.com/index.xml", 0],
["Science", "http://www.realclearscience.com/index.xml", 0],
["Politics", "http://www.realclearpolitics.com/index.xml", 0],
["Policy", "http://www.realclearpolicy.com/index.xml", 0],
["Science", "http://www.realclearscience.com/index.xml", 0],
["Tech", "http://www.realcleartechnology.com/index.xml", 0],
# The feedburner is essentially the same as the top feed, politics.
# ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],

@@ -37,7 +40,9 @@ class RealClear(BasicNewsRecipe):
]
# Hints to extractPrintURL.
# First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
printhints = [
phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)

printhints = [ ["realclear", "", '' , 'printpage'],
["billoreilly.com", "Print this entry", 'a', ''],
["billoreilly.com", "Print This Article", 'a', ''],
["politico.com", "Print", 'a', 'share-print'],
@@ -48,11 +53,24 @@ class RealClear(BasicNewsRecipe):
# usatoday - just prints with all current crap anyhow

]
# RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
# The print link isn't obvious, and only the end is needed (the -full append.) So maybe try that first?
# http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
# Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
# Use the FULL PRINTPAGE URL; it formats it better too!
#
# NYT - try single page...
# Need special code - is it one page or several? Which URL?
# from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
# to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
# which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"

# Returns the best-guess print url.
# The second parameter (pageURL) is returned if nothing is found.
def extractPrintURL(self, pageURL):
tagURL = pageURL
baseParse = urlparse(pageURL)
baseURL = baseParse[0]+"://"+baseParse[1]
hintsCount =len(self.printhints)
for x in range(0,hintsCount):
if pageURL.find(self.printhints[x][0])== -1 :
@@ -62,23 +80,37 @@ class RealClear(BasicNewsRecipe):
soup = self.index_to_soup(pageURL)
if soup is None:
return pageURL
if len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
# e.g. RealClear
if self.debugMessages == True :
print("search1")
print("Search by href: "+self.printhints[x][self.phHrefSearch])
printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
if self.debugMessages == True :
print("Search 1: "+self.printhints[x][2]+" Attributes: ")
print(self.printhints[x][3])
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
elif len(self.printhints[x][3])>0 :
if self.debugMessages == True :
print("search2")
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
else :
if self.debugMessages == True:
print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
if printFind is None:
if self.debugMessages == True :
print("Not Found")
# print(soup)
print("end soup\n\n");
continue

print(printFind)
if isinstance(printFind, NavigableString)==False:
if printFind['href'] is not None:
print("Check "+printFind['href']+" for base of "+baseURL)
if printFind['href'].find("http")!=0 :
return baseURL+printFind['href']
return printFind['href']
tag = printFind.parent
print(tag)
@@ -158,6 +190,7 @@ class RealClear(BasicNewsRecipe):
def parse_index(self):
# Parse the page into Python Soup

articleList = []
ans = []
feedsCount = len(self.feedsets)
for x in range(0,feedsCount): # should be ,4

@@ -168,3 +201,4 @@ class RealClear(BasicNewsRecipe):
print(ans)
return ans
@@ -15,6 +15,8 @@ class Soldiers(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
auto_cleanup = True
auto_cleanup_keep = '//div[@id="mediaWrapper"]'
simultaneous_downloads = 1
delay = 4
max_connections = 1

@@ -31,14 +33,14 @@ class Soldiers(BasicNewsRecipe):
, 'language' : language
}

keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})]
#keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})]

remove_tags = [
dict(name='div', attrs={'id':['addThis','comment','articleFooter']})
,dict(name=['object','link'])
]
#remove_tags = [
#dict(name='div', attrs={'id':['addThis','comment','articleFooter']})
#,dict(name=['object','link'])
#]

feeds = [(u'Frontpage', u'http://www.army.mil/rss/feeds/soldiersfrontpage.xml' )]
feeds = [(u'Frontpage', u'http://www.army.mil/rss/2/' )]

def get_cover_url(self):
@@ -14,7 +14,7 @@ from setup.build_environment import msvc, MT, RC
from setup.installer.windows.wix import WixMixIn

OPENSSL_DIR = r'Q:\openssl'
QT_DIR = 'Q:\\Qt\\4.8.0'
QT_DIR = 'Q:\\Qt\\4.8.1'
QT_DLLS = ['Core', 'Gui', 'Network', 'Svg', 'WebKit', 'Xml', 'XmlPatterns']
LIBUNRAR = 'C:\\Program Files\\UnrarDLL\\unrar.dll'
SW = r'C:\cygwin\home\kovid\sw'
@@ -107,6 +107,7 @@ class ANDROID(USBMS):
0xc004 : [0x0226],
0x8801 : [0x0226, 0x0227],
0xe115 : [0x0216], # PocketBook A10
0xe107 : [0x326], # PocketBook 622
},

# Acer
src/calibre/ebooks/metadata/sources/worker.py (new file, 95 lines)
@@ -0,0 +1,95 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os
from threading import Event
from io import BytesIO

from calibre.utils.date import as_utc
from calibre.ebooks.metadata.sources.identify import identify, msprefs
from calibre.ebooks.metadata.book.base import Metadata
from calibre.customize.ui import metadata_plugins
from calibre.ebooks.metadata.sources.covers import download_cover
from calibre.utils.logging import GUILog
from calibre.ebooks.metadata.opf2 import metadata_to_opf, OPF

def merge_result(oldmi, newmi, ensure_fields=None):
dummy = Metadata(_('Unknown'))
for f in msprefs['ignore_fields']:
if ':' in f or (ensure_fields and f in ensure_fields):
continue
setattr(newmi, f, getattr(dummy, f))
fields = set()
for plugin in metadata_plugins(['identify']):
fields |= plugin.touched_fields

def is_equal(x, y):
if hasattr(x, 'tzinfo'):
x = as_utc(x)
if hasattr(y, 'tzinfo'):
y = as_utc(y)
return x == y

for f in fields:
# Optimize so that set_metadata does not have to do extra work later
if not f.startswith('identifier:'):
if (not newmi.is_null(f) and is_equal(getattr(newmi, f),
getattr(oldmi, f))):
setattr(newmi, f, getattr(dummy, f))

return newmi

def main(do_identify, covers, metadata, ensure_fields):
failed_ids = set()
failed_covers = set()
all_failed = True
log = GUILog()

for book_id, mi in metadata.iteritems():
mi = OPF(BytesIO(mi), basedir=os.getcwdu(),
populate_spine=False).to_book_metadata()
title, authors, identifiers = mi.title, mi.authors, mi.identifiers
cdata = None
log.clear()

if do_identify:
results = []
try:
results = identify(log, Event(), title=title, authors=authors,
identifiers=identifiers)
except:
pass
if results:
all_failed = False
mi = merge_result(mi, results[0], ensure_fields=ensure_fields)
identifiers = mi.identifiers
if not mi.is_null('rating'):
# set_metadata expects a rating out of 10
mi.rating *= 2
with open('%d.mi'%book_id, 'wb') as f:
f.write(metadata_to_opf(mi, default_lang='und'))
else:
log.error('Failed to download metadata for', title)
failed_ids.add(book_id)

if covers:
cdata = download_cover(log, title=title, authors=authors,
identifiers=identifiers)
if cdata is None:
failed_covers.add(book_id)
else:
with open('%d.cover'%book_id, 'wb') as f:
f.write(cdata[-1])
all_failed = False

with open('%d.log'%book_id, 'wb') as f:
f.write(log.plain_text.encode('utf-8'))

return failed_ids, failed_covers, all_failed
@@ -10,13 +10,19 @@ __docformat__ = 'restructuredtext en'
import struct, re, os, imghdr
from collections import namedtuple
from itertools import repeat
from urlparse import urldefrag

from lxml import etree

from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.utils import read_font_record
from calibre.ebooks.oeb.parse_utils import parse_html
from calibre.ebooks.oeb.base import XPath, XHTML, xml2text

Part = namedtuple('Part',
'num type filename start end aid')

@@ -383,6 +389,19 @@ class Mobi8Reader(object):
len(resource_map)):
mi.cover = resource_map[self.cover_offset]

if len(list(toc)) < 2:
self.log.warn('KF8 has no metadata Table of Contents')

for ref in guide:
if ref.type == 'toc':
href = ref.href()
href, frag = urldefrag(href)
if os.path.exists(href.replace('/', os.sep)):
try:
toc = self.read_inline_toc(href, frag)
except:
self.log.exception('Failed to read inline ToC')

opf = OPFCreator(os.getcwdu(), mi)
opf.guide = guide

@@ -397,4 +416,70 @@ class Mobi8Reader(object):
opf.render(of, ncx, 'toc.ncx')
return 'metadata.opf'
def read_inline_toc(self, href, frag):
ans = TOC()
base_href = '/'.join(href.split('/')[:-1])
with open(href.replace('/', os.sep), 'rb') as f:
raw = f.read().decode(self.header.codec)
root = parse_html(raw, log=self.log)
body = XPath('//h:body')(root)
reached = False
if body:
start = body[0]
else:
start = None
reached = True
if frag:
elems = XPath('//*[@id="%s"]'%frag)
if elems:
start = elems[0]

def node_depth(elem):
ans = 0
parent = elem.getparent()
while parent is not None:
parent = parent.getparent()
ans += 1
return ans

# Layer the ToC based on nesting order in the source HTML
current_depth = None
parent = ans
seen = set()
links = []
for elem in root.iterdescendants(etree.Element):
if reached and elem.tag == XHTML('a') and elem.get('href',
False):
href = elem.get('href')
href, frag = urldefrag(href)
href = base_href + '/' + href
text = xml2text(elem).strip()
if (text, href, frag) in seen:
continue
seen.add((text, href, frag))
links.append((text, href, frag, node_depth(elem)))
elif elem is start:
reached = True

depths = sorted(set(x[-1] for x in links))
depth_map = {x:i for i, x in enumerate(depths)}
for text, href, frag, depth in links:
depth = depth_map[depth]
if current_depth is None:
current_depth = 0
parent.add_item(href, frag, text)
elif current_depth == depth:
parent.add_item(href, frag, text)
elif current_depth < depth:
parent = parent[-1] if len(parent) > 0 else parent
parent.add_item(href, frag, text)
current_depth += 1
else:
delta = current_depth - depth
while delta > 0 and parent.parent is not None:
parent = parent.parent
delta -= 1
parent.add_item(href, frag, text)
current_depth = depth
return ans
@@ -40,27 +40,34 @@ def get_custom_size(opts):
custom_size = None
return custom_size

def get_pdf_printer(opts, for_comic=False):
def get_pdf_printer(opts, for_comic=False, output_file_name=None):
from calibre.gui2 import is_ok_to_use_qt
if not is_ok_to_use_qt():
raise Exception('Not OK to use Qt')

printer = QPrinter(QPrinter.HighResolution)
custom_size = get_custom_size(opts)

if opts.output_profile.short_name == 'default' or \
opts.output_profile.width > 9999:
if custom_size is None:
printer.setPaperSize(paper_size(opts.paper_size))
else:
printer.setPaperSize(QSizeF(custom_size[0], custom_size[1]), unit(opts.unit))
if isosx and not for_comic:
# On OSX, the native engine can only produce a single page size
# (usually A4). The Qt engine on the other hand produces image based
# PDFs. If we set a custom page size using QSizeF the native engine
# produces unreadable output, so we just ignore the custom size
# settings.
printer.setPaperSize(paper_size(opts.paper_size))
else:
w = opts.output_profile.comic_screen_size[0] if for_comic else \
opts.output_profile.width
h = opts.output_profile.comic_screen_size[1] if for_comic else \
opts.output_profile.height
dpi = opts.output_profile.dpi
printer.setPaperSize(QSizeF(float(w) / dpi, float(h) / dpi), QPrinter.Inch)
if opts.output_profile.short_name == 'default' or \
opts.output_profile.width > 9999:
if custom_size is None:
printer.setPaperSize(paper_size(opts.paper_size))
else:
printer.setPaperSize(QSizeF(custom_size[0], custom_size[1]), unit(opts.unit))
else:
w = opts.output_profile.comic_screen_size[0] if for_comic else \
opts.output_profile.width
h = opts.output_profile.comic_screen_size[1] if for_comic else \
opts.output_profile.height
dpi = opts.output_profile.dpi
printer.setPaperSize(QSizeF(float(w) / dpi, float(h) / dpi), QPrinter.Inch)

if for_comic:
# Comic pages typically have their own margins, or their background

@@ -72,6 +79,12 @@ def get_pdf_printer(opts, for_comic=False):
printer.setOrientation(orientation(opts.orientation))
printer.setOutputFormat(QPrinter.PdfFormat)
printer.setFullPage(for_comic)
if output_file_name:
printer.setOutputFileName(output_file_name)
if isosx and not for_comic:
# Ensure we are not generating enormous image based PDFs
printer.setOutputFormat(QPrinter.NativeFormat)

return printer
def get_printer_page_size(opts, for_comic=False):

@@ -163,15 +176,7 @@ class PDFWriter(QObject): # {{{
if ok:
item_path = os.path.join(self.tmp_path, '%i.pdf' % len(self.combine_queue))
self.logger.debug('\tRendering item %s as %i.pdf' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue)))
printer = get_pdf_printer(self.opts)
printer.setOutputFileName(item_path)
# We have to set the engine to Native on OS X after the call to set
# filename. Setting a filename with .pdf as the extension causes
# Qt to set the format to use Qt's PDF engine even if native was
# previously set on the printer. Qt's PDF engine produces image
# based PDFs on OS X, so we cannot use it.
if isosx:
printer.setOutputFormat(QPrinter.NativeFormat)
printer = get_pdf_printer(self.opts, output_file_name=item_path)
self.view.page().mainFrame().evaluateJavaScript('''
document.body.style.backgroundColor = "white";

@@ -193,10 +198,7 @@ class PDFWriter(QObject): # {{{
if self.cover_data is None:
return
item_path = os.path.join(self.tmp_path, 'cover.pdf')
printer = get_pdf_printer(self.opts)
printer.setOutputFileName(item_path)
if isosx:
printer.setOutputFormat(QPrinter.NativeFormat)
printer = get_pdf_printer(self.opts, output_file_name=item_path)
self.combine_queue.insert(0, item_path)
p = QPixmap()
p.loadFromData(self.cover_data)

@@ -248,10 +250,8 @@ class ImagePDFWriter(object):
os.remove(f.name)

def render_images(self, outpath, mi, items):
printer = get_pdf_printer(self.opts, for_comic=True)
printer.setOutputFileName(outpath)
if isosx:
printer.setOutputFormat(QPrinter.NativeFormat)
printer = get_pdf_printer(self.opts, for_comic=True,
output_file_name=outpath)
printer.setDocName(mi.title)
printer.setCreator(u'%s [%s]'%(__appname__, __version__))
# Seems to be no way to set author
@@ -105,6 +105,7 @@ gprefs.defaults['show_files_after_save'] = True
gprefs.defaults['auto_add_path'] = None
gprefs.defaults['auto_add_check_for_duplicates'] = False
gprefs.defaults['blocked_auto_formats'] = []
gprefs.defaults['auto_add_auto_convert'] = True
# }}}

NONE = QVariant() #: Null value to return from the data function of item models
@@ -71,7 +71,7 @@ class AddAction(InterfaceAction):
ma('add-formats', _('Add files to selected book records'),
triggered=self.add_formats, shortcut=_('Shift+A'))
self.add_menu.addSeparator()
ma('add-config', _('Configure the adding of books'),
ma('add-config', _('Control the adding of books'),
triggered=self.add_config)

self.qaction.triggered.connect(self.add_books)
@@ -53,6 +53,24 @@ class ConvertAction(InterfaceAction):
self.queue_convert_jobs(jobs, changed, bad, rows, previous,
self.book_auto_converted, extra_job_args=[on_card])

def auto_convert_auto_add(self, book_ids):
previous = self.gui.library_view.currentIndex()
db = self.gui.current_db
needed = set()
of = prefs['output_format'].lower()
for book_id in book_ids:
fmts = db.formats(book_id, index_is_id=True)
fmts = set(x.lower() for x in fmts.split(',')) if fmts else set()
if of not in fmts:
needed.add(book_id)
if needed:
jobs, changed, bad = convert_single_ebook(self.gui,
self.gui.library_view.model().db, needed, True, of,
show_no_format_warning=False)
if not jobs: return
self.queue_convert_jobs(jobs, changed, bad, list(needed), previous,
self.book_converted, rows_are_ids=True)

def auto_convert_mail(self, to, fmts, delete_from_library, book_ids, format, subject):
previous = self.gui.library_view.currentIndex()
rows = [x.row() for x in \

@@ -118,7 +136,7 @@ class ConvertAction(InterfaceAction):
num, 2000)

def queue_convert_jobs(self, jobs, changed, bad, rows, previous,
converted_func, extra_job_args=[]):
converted_func, extra_job_args=[], rows_are_ids=False):
for func, args, desc, fmt, id, temp_files in jobs:
func, _, same_fmt = func.partition(':')
same_fmt = same_fmt == 'same_fmt'

@@ -140,7 +158,11 @@ class ConvertAction(InterfaceAction):
self.conversion_jobs[job] = tuple(args)

if changed:
self.gui.library_view.model().refresh_rows(rows)
m = self.gui.library_view.model()
if rows_are_ids:
m.refresh_ids(rows)
else:
m.refresh_rows(rows)
current = self.gui.library_view.currentIndex()
self.gui.library_view.model().current_changed(current, previous)
@@ -5,7 +5,7 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os
import os, shutil
from functools import partial

from PyQt4.Qt import QMenu, QModelIndex, QTimer

@@ -16,6 +16,7 @@ from calibre.gui2.dialogs.confirm_delete import confirm
from calibre.gui2.dialogs.device_category_editor import DeviceCategoryEditor
from calibre.gui2.actions import InterfaceAction
from calibre.ebooks.metadata import authors_to_string
from calibre.ebooks.metadata.opf2 import OPF
from calibre.utils.icu import sort_key
from calibre.db.errors import NoSuchFormat

@@ -79,14 +80,23 @@ class EditMetadataAction(InterfaceAction):
Dispatcher(self.metadata_downloaded),
ensure_fields=ensure_fields)

def cleanup_bulk_download(self, tdir):
try:
shutil.rmtree(tdir, ignore_errors=True)
except:
pass

def metadata_downloaded(self, job):
if job.failed:
self.gui.job_exception(job, dialog_title=_('Failed to download metadata'))
return
from calibre.gui2.metadata.bulk_download import get_job_details
id_map, failed_ids, failed_covers, all_failed, det_msg = \
get_job_details(job)
(aborted, id_map, tdir, log_file, failed_ids, failed_covers, all_failed,
det_msg, lm_map) = get_job_details(job)
if aborted:
return self.cleanup_bulk_download(tdir)
if all_failed:
self.cleanup_bulk_download(tdir)
return error_dialog(self.gui, _('Download failed'),
_('Failed to download metadata or covers for any of the %d'
' book(s).') % len(id_map), det_msg=det_msg, show=True)

@@ -103,28 +113,26 @@ class EditMetadataAction(InterfaceAction):
msg += '<p>'+_('Could not download metadata and/or covers for %d of the books. Click'
' "Show details" to see which books.')%num

payload = (id_map, failed_ids, failed_covers)
payload = (id_map, tdir, log_file, lm_map)
from calibre.gui2.dialogs.message_box import ProceedNotification
p = ProceedNotification(self.apply_downloaded_metadata,
payload, job.html_details,
payload, log_file,
_('Download log'), _('Download complete'), msg,
det_msg=det_msg, show_copy_button=show_copy_button,
parent=self.gui)
parent=self.gui, log_is_file=True)
p.show()
def apply_downloaded_metadata(self, payload):
id_map, failed_ids, failed_covers = payload
id_map = dict([(k, v) for k, v in id_map.iteritems() if k not in
failed_ids])
if not id_map:
good_ids, tdir, log_file, lm_map = payload
if not good_ids:
return

modified = set()
db = self.gui.current_db

for i, mi in id_map.iteritems():
for i in good_ids:
lm = db.metadata_last_modified(i, index_is_id=True)
if lm > mi.last_modified:
if lm > lm_map[i]:
title = db.title(i, index_is_id=True)
authors = db.authors(i, index_is_id=True)
if authors:

@@ -144,7 +152,18 @@ class EditMetadataAction(InterfaceAction):
'Do you want to proceed?'), det_msg='\n'.join(modified)):
return

self.apply_metadata_changes(id_map)
id_map = {}
for bid in good_ids:
opf = os.path.join(tdir, '%d.mi'%bid)
if not os.path.exists(opf):
opf = None
cov = os.path.join(tdir, '%d.cover'%bid)
if not os.path.exists(cov):
cov = None
id_map[bid] = (opf, cov)

self.apply_metadata_changes(id_map, callback=lambda x:
self.cleanup_bulk_download(tdir))

# }}}

@@ -468,6 +487,11 @@ class EditMetadataAction(InterfaceAction):
callback can be either None or a function accepting a single argument,
in which case it is called after applying is complete with the list of
changed ids.

id_map can also be a mapping of ids to 2-tuple's where each 2-tuple
contains the absolute paths to an OPF and cover file respectively. If
either of the paths is None, then the corresponding metadata is not
updated.
'''
if title is None:
title = _('Applying changed metadata')
@@ -492,28 +516,48 @@ class EditMetadataAction(InterfaceAction):
return self.finalize_apply()

i, mi = self.apply_id_map[self.apply_current_idx]
if isinstance(mi, tuple):
opf, cover = mi
if opf:
mi = OPF(open(opf, 'rb'), basedir=os.path.dirname(opf),
populate_spine=False).to_book_metadata()
self.apply_mi(i, mi)
if cover:
self.gui.current_db.set_cover(i, open(cover, 'rb'),
notify=False, commit=False)
else:
self.apply_mi(i, mi)

self.apply_current_idx += 1
if self.apply_pd is not None:
self.apply_pd.value += 1
QTimer.singleShot(50, self.do_one_apply)

def apply_mi(self, book_id, mi):
db = self.gui.current_db

try:
set_title = not mi.is_null('title')
set_authors = not mi.is_null('authors')
idents = db.get_identifiers(i, index_is_id=True)
idents = db.get_identifiers(book_id, index_is_id=True)
if mi.identifiers:
idents.update(mi.identifiers)
mi.identifiers = idents
if mi.is_null('series'):
mi.series_index = None
if self._am_merge_tags:
old_tags = db.tags(i, index_is_id=True)
old_tags = db.tags(book_id, index_is_id=True)
if old_tags:
tags = [x.strip() for x in old_tags.split(',')] + (
mi.tags if mi.tags else [])
mi.tags = list(set(tags))
db.set_metadata(i, mi, commit=False, set_title=set_title,
db.set_metadata(book_id, mi, commit=False, set_title=set_title,
set_authors=set_authors, notify=False)
self.applied_ids.append(i)
self.applied_ids.append(book_id)
except:
import traceback
self.apply_failures.append((i, traceback.format_exc()))
self.apply_failures.append((book_id, traceback.format_exc()))

try:
if mi.cover:

@@ -521,11 +565,6 @@ class EditMetadataAction(InterfaceAction):
except:
pass

self.apply_current_idx += 1
if self.apply_pd is not None:
self.apply_pd.value += 1
QTimer.singleShot(50, self.do_one_apply)

def finalize_apply(self):
db = self.gui.current_db
db.commit()
@@ -113,6 +113,7 @@ class Worker(Thread):
class AutoAdder(QObject):

metadata_read = pyqtSignal(object)
auto_convert = pyqtSignal(object)

def __init__(self, path, parent):
QObject.__init__(self, parent)

@@ -124,6 +125,8 @@ class AutoAdder(QObject):
self.metadata_read.connect(self.add_to_db,
type=Qt.QueuedConnection)
QTimer.singleShot(2000, self.initialize)
self.auto_convert.connect(self.do_auto_convert,
type=Qt.QueuedConnection)
elif path:
prints(path,
'is not a valid directory to watch for new ebooks, ignoring')

@@ -163,6 +166,7 @@ class AutoAdder(QObject):
needs_rescan = False
duplicates = []
added_ids = set()

for fname, tdir in data.iteritems():
paths = [os.path.join(self.worker.path, fname)]

@@ -187,9 +191,12 @@ class AutoAdder(QObject):
continue
mi = [OPF(open(mi, 'rb'), tdir,
populate_spine=False).to_book_metadata()]
dups, num = m.add_books(paths,
dups, ids = m.add_books(paths,
[os.path.splitext(fname)[1][1:].upper()], mi,
add_duplicates=not gprefs['auto_add_check_for_duplicates'])
add_duplicates=not gprefs['auto_add_check_for_duplicates'],
return_ids=True)
added_ids |= set(ids)
num = len(ids)
if dups:
path = dups[0][0]
with open(os.path.join(tdir, 'dup_cache.'+dups[1][0].lower()),
@@ -217,8 +224,10 @@ class AutoAdder(QObject):
_('Books with the same title as the following already '
'exist in the database. Add them anyway?'),
'\n'.join(files)):
dups, num = m.add_books(paths, formats, metadata,
add_duplicates=True)
dups, ids = m.add_books(paths, formats, metadata,
add_duplicates=True, return_ids=True)
added_ids |= set(ids)
num = len(ids)
count += num

for tdir in data.itervalues():

@@ -227,6 +236,9 @@ class AutoAdder(QObject):
except:
pass

if added_ids and gprefs['auto_add_auto_convert']:
self.auto_convert.emit(added_ids)

if count > 0:
m.books_added(count)
gui.status_bar.show_message(_(

@@ -238,4 +250,7 @@ class AutoAdder(QObject):
if needs_rescan:
QTimer.singleShot(2000, self.dir_changed)

def do_auto_convert(self, added_ids):
gui = self.parent()
gui.iactions['Convert Books'].auto_convert_auto_add(added_ids)
@@ -160,7 +160,7 @@ class ProceedNotification(MessageBox): # {{{

def __init__(self, callback, payload, html_log, log_viewer_title, title, msg,
det_msg='', show_copy_button=False, parent=None,
cancel_callback=None):
cancel_callback=None, log_is_file=False):
'''
A non modal popup that notifies the user that a background task has
been completed.

@@ -175,12 +175,15 @@ class ProceedNotification(MessageBox): # {{{
:param title: The title for this popup
:param msg: The msg to display
:param det_msg: Detailed message
:param log_is_file: If True the html_log parameter is interpreted as
the path to a file on disk containing the log encoded with utf-8
'''
MessageBox.__init__(self, MessageBox.QUESTION, title, msg,
det_msg=det_msg, show_copy_button=show_copy_button,
parent=parent)
self.payload = payload
self.html_log = html_log
self.log_is_file = log_is_file
self.log_viewer_title = log_viewer_title

self.vlb = self.bb.addButton(_('View log'), self.bb.ActionRole)

@@ -192,7 +195,11 @@ class ProceedNotification(MessageBox): # {{{
_proceed_memory.append(self)

def show_log(self):
self.log_viewer = ViewLog(self.log_viewer_title, self.html_log,
log = self.html_log
if self.log_is_file:
with open(log, 'rb') as f:
log = f.read().decode('utf-8')
self.log_viewer = ViewLog(self.log_viewer_title, log,
parent=self)

def do_proceed(self, result):

@@ -202,9 +209,9 @@ class ProceedNotification(MessageBox): # {{{
gui = get_gui()
gui.proceed_requested.emit(func, self.payload)
# Ensure this notification is garbage collected
self.vlb.clicked.disconnect()
self.callback = self.cancel_callback = self.payload = None
self.setParent(None)
self.vlb.clicked.disconnect()
_proceed_memory.remove(self)

def done(self, r):
@@ -140,34 +140,6 @@
</item>
</layout>
</item>
<item>
<widget class="QGroupBox" name="groupBox">
<property name="maximumSize">
<size>
<width>16777215</width>
<height>60</height>
</size>
</property>
<layout class="QHBoxLayout" name="horizontalLayout_5">
<item>
<widget class="QLabel" name="label_51">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Preferred">
<horstretch>40</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="text">
<string/>
</property>
<property name="buddy">
<cstring>matchkind</cstring>
</property>
</widget>
</item>
</layout>
</widget>
</item>
<item>
<widget class="QLabel" name="label_6">
<property name="maximumSize">
@@ -402,7 +402,8 @@ class DetailView(QDialog, Ui_Dialog): # {{{
self.setupUi(self)
self.setWindowTitle(job.description)
self.job = job
self.html_view = hasattr(job, 'html_details')
self.html_view = (hasattr(job, 'html_details') and not getattr(job,
'ignore_html_details', False))
if self.html_view:
self.log.setVisible(False)
else:
@@ -187,9 +187,10 @@ class BooksModel(QAbstractTableModel): # {{{
self.db = None
self.reset()

def add_books(self, paths, formats, metadata, add_duplicates=False):
def add_books(self, paths, formats, metadata, add_duplicates=False,
return_ids=False):
ret = self.db.add_books(paths, formats, metadata,
add_duplicates=add_duplicates)
add_duplicates=add_duplicates, return_ids=return_ids)
self.count_changed()
return ret
@@ -7,22 +7,41 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os, time, shutil
from functools import partial
from itertools import izip
from threading import Event

from PyQt4.Qt import (QIcon, QDialog,
QDialogButtonBox, QLabel, QGridLayout, QPixmap, Qt)

from calibre.gui2.threaded_jobs import ThreadedJob
from calibre.ebooks.metadata.sources.identify import identify, msprefs
from calibre.ebooks.metadata.sources.covers import download_cover
from calibre.ebooks.metadata.book.base import Metadata
from calibre.customize.ui import metadata_plugins
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.date import as_utc
from calibre.ebooks.metadata.opf2 import metadata_to_opf
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
from calibre.ptempfile import (PersistentTemporaryDirectory,
PersistentTemporaryFile)

# Start download {{{

class Job(ThreadedJob):

ignore_html_details = True

def consolidate_log(self):
self.consolidated_log = self.log.plain_text
self.log = None

def read_consolidated_log(self):
return self.consolidated_log

@property
def details(self):
if self.consolidated_log is None:
return self.log.plain_text
return self.read_consolidated_log()

@property
def log_file(self):
return open(self.download_debug_log, 'rb')

def show_config(gui, parent):
from calibre.gui2.preferences import show_config_widget
show_config_widget('Sharing', 'Metadata download', parent=parent,
@@ -104,19 +123,22 @@ def start_download(gui, ids, callback, ensure_fields=None):
d.b.clicked.disconnect()
if ret != d.Accepted:
return
tf = PersistentTemporaryFile('_metadata_bulk_log_')
tf.close()

for batch in split_jobs(ids):
job = ThreadedJob('metadata bulk download',
_('Download metadata for %d books')%len(batch),
download, (batch, gui.current_db, d.identify, d.covers,
ensure_fields), {}, callback)
gui.job_manager.run_threaded_job(job)
job = Job('metadata bulk download',
_('Download metadata for %d books')%len(ids),
download, (ids, tf.name, gui.current_db, d.identify, d.covers,
ensure_fields), {}, callback)
job.download_debug_log = tf.name
gui.job_manager.run_threaded_job(job)
gui.status_bar.show_message(_('Metadata download started'), 3000)

# }}}

def get_job_details(job):
id_map, failed_ids, failed_covers, title_map, all_failed = job.result
(aborted, good_ids, tdir, log_file, failed_ids, failed_covers, title_map,
lm_map, all_failed) = job.result
det_msg = []
for i in failed_ids | failed_covers:
title = title_map[i]

@@ -126,92 +148,89 @@ def get_job_details(job):
title += (' ' + _('(Failed cover)'))
det_msg.append(title)
det_msg = '\n'.join(det_msg)
return id_map, failed_ids, failed_covers, all_failed, det_msg
return (aborted, good_ids, tdir, log_file, failed_ids, failed_covers,
all_failed, det_msg, lm_map)
def merge_result(oldmi, newmi, ensure_fields=None):
dummy = Metadata(_('Unknown'))
for f in msprefs['ignore_fields']:
if ':' in f or (ensure_fields and f in ensure_fields):
continue
setattr(newmi, f, getattr(dummy, f))
fields = set()
for plugin in metadata_plugins(['identify']):
fields |= plugin.touched_fields
class HeartBeat(object):
CHECK_INTERVAL = 300 # seconds
''' Check that the file count in tdir changes every five minutes '''

def is_equal(x, y):
if hasattr(x, 'tzinfo'):
x = as_utc(x)
if hasattr(y, 'tzinfo'):
y = as_utc(y)
return x == y
def __init__(self, tdir):
self.tdir = tdir
self.last_count = len(os.listdir(self.tdir))
self.last_time = time.time()

for f in fields:
# Optimize so that set_metadata does not have to do extra work later
if not f.startswith('identifier:'):
if (not newmi.is_null(f) and is_equal(getattr(newmi, f),
getattr(oldmi, f))):
setattr(newmi, f, getattr(dummy, f))
def __call__(self):
if time.time() - self.last_time > self.CHECK_INTERVAL:
c = len(os.listdir(self.tdir))
if c == self.last_count:
return False
self.last_count = c
self.last_time = time.time()
return True

newmi.last_modified = oldmi.last_modified
# Fix log viewer, ratings
# Test: abort, covers only, metadata only, both, 200 entry download, memory
# consumption, all errors and on and on

return newmi
def download(ids, db, do_identify, covers, ensure_fields,
def download(all_ids, tf, db, do_identify, covers, ensure_fields,
log=None, abort=None, notifications=None):
ids = list(ids)
metadata = [db.get_metadata(i, index_is_id=True, get_user_categories=False)
for i in ids]
batch_size = 10
batches = split_jobs(all_ids, batch_size=batch_size)
tdir = PersistentTemporaryDirectory('_metadata_bulk_')
heartbeat = HeartBeat(tdir)

failed_ids = set()
failed_covers = set()
title_map = {}
ans = {}
count = 0
lm_map = {}
ans = set()
all_failed = True
'''
# Test apply dialog
all_failed = do_identify = covers = False
'''
for i, mi in izip(ids, metadata):
aborted = False
count = 0

for ids in batches:
if abort.is_set():
log.error('Aborting...')
break
title, authors, identifiers = mi.title, mi.authors, mi.identifiers
title_map[i] = title
if do_identify:
results = []
try:
results = identify(log, Event(), title=title, authors=authors,
identifiers=identifiers)
except:
pass
if results:
all_failed = False
mi = merge_result(mi, results[0], ensure_fields=ensure_fields)
identifiers = mi.identifiers
if not mi.is_null('rating'):
# set_metadata expects a rating out of 10
mi.rating *= 2
else:
log.error('Failed to download metadata for', title)
failed_ids.add(i)
# We don't want set_metadata operating on anything but covers
mi = merge_result(mi, mi, ensure_fields=ensure_fields)
if covers:
cdata = download_cover(log, title=title, authors=authors,
identifiers=identifiers)
if cdata is not None:
with PersistentTemporaryFile('.jpg', 'downloaded-cover-') as f:
f.write(cdata[-1])
mi.cover = f.name
all_failed = False
else:
failed_covers.add(i)
ans[i] = mi
count += 1
metadata = {i:db.get_metadata(i, index_is_id=True,
get_user_categories=False) for i in ids}
for i in ids:
title_map[i] = metadata[i].title
lm_map[i] = metadata[i].last_modified
metadata = {i:metadata_to_opf(mi, default_lang='und') for i, mi in
metadata.iteritems()}
try:
ret = fork_job('calibre.ebooks.metadata.sources.worker', 'main',
(do_identify, covers, metadata, ensure_fields),
cwd=tdir, abort=abort, heartbeat=heartbeat, no_output=True)
except WorkerError as e:
if e.orig_tb:
raise Exception('Failed to download metadata. Original '
'traceback: \n\n'+e.orig_tb)
raise
count += batch_size
notifications.put((count/len(ids),
_('Downloaded %(num)d of %(tot)d')%dict(num=count, tot=len(ids))))
_('Downloaded %(num)d of %(tot)d')%dict(
num=count, tot=len(all_ids))))

fids, fcovs, allf = ret['result']
if not allf:
all_failed = False
failed_ids = failed_ids.union(fids)
failed_covers = failed_covers.union(fcovs)
ans = ans.union(set(ids) - fids)
for book_id in ids:
lp = os.path.join(tdir, '%d.log'%book_id)
if os.path.exists(lp):
with open(tf, 'ab') as dest, open(lp, 'rb') as src:
dest.write(('\n'+'#'*20 + ' Log for %s '%title_map[book_id] +
'#'*20+'\n').encode('utf-8'))
shutil.copyfileobj(src, dest)

if abort.is_set():
aborted = True
log('Download complete, with %d failures'%len(failed_ids))
return (ans, failed_ids, failed_covers, title_map, all_failed)


return (aborted, ans, tdir, tf, failed_ids, failed_covers, title_map,
lm_map, all_failed)

@ -161,10 +161,10 @@ class MetadataSingleDialogBase(ResizableDialog):
self.manage_authors_button.clicked.connect(self.authors.manage_authors)

self.series = SeriesEdit(self)
self.remove_unused_series_button = QToolButton(self)
self.remove_unused_series_button.setToolTip(
_('Remove unused series (Series that have no books)') )
self.remove_unused_series_button.clicked.connect(self.remove_unused_series)
self.clear_series_button = QToolButton(self)
self.clear_series_button.setToolTip(
_('Clear series') )
self.clear_series_button.clicked.connect(self.series.clear)
self.series_index = SeriesIndexEdit(self, self.series)
self.basic_metadata_widgets.extend([self.series, self.series_index])

@ -198,6 +198,7 @@ class MetadataSingleDialogBase(ResizableDialog):
self.basic_metadata_widgets.append(self.identifiers)
self.clear_identifiers_button = QToolButton(self)
self.clear_identifiers_button.setIcon(QIcon(I('trash.png')))
self.clear_identifiers_button.setToolTip(_('Clear Ids'))
self.clear_identifiers_button.clicked.connect(self.identifiers.clear)
self.paste_isbn_button = QToolButton(self)
self.paste_isbn_button.setToolTip('<p>' +
@ -303,17 +304,6 @@ class MetadataSingleDialogBase(ResizableDialog):
self.title_sort.auto_generate()
self.author_sort.auto_generate()

def remove_unused_series(self, *args):
self.db.remove_unused_series()
idx = self.series.current_val
self.series.clear()
self.series.initialize(self.db, self.book_id)
if idx:
for i in range(self.series.count()):
if unicode(self.series.itemText(i)) == idx:
self.series.setCurrentIndex(i)
break

def tags_editor(self, *args):
self.tags.edit(self.db, self.book_id)

@ -591,7 +581,7 @@ class MetadataSingleDialog(MetadataSingleDialogBase): # {{{
sto(self.title_sort, self.authors)
create_row(1, self.authors, self.deduce_author_sort_button, self.author_sort)
sto(self.author_sort, self.series)
create_row(2, self.series, self.remove_unused_series_button,
create_row(2, self.series, self.clear_series_button,
self.series_index, icon='trash.png')
sto(self.series_index, self.swap_title_author_button)
sto(self.swap_title_author_button, self.manage_authors_button)
@ -756,7 +746,7 @@ class MetadataSingleDialogAlt1(MetadataSingleDialogBase): # {{{
span=2, icon='auto_author_sort.png')
create_row(3, self.author_sort, self.series)
create_row(4, self.series, self.series_index,
button=self.remove_unused_series_button, icon='trash.png')
button=self.clear_series_button, icon='trash.png')
create_row(5, self.series_index, self.tags)
create_row(6, self.tags, self.rating, button=self.tags_editor_button)
create_row(7, self.rating, self.pubdate)
@ -892,7 +882,7 @@ class MetadataSingleDialogAlt2(MetadataSingleDialogBase): # {{{
span=2, icon='auto_author_sort.png')
create_row(3, self.author_sort, self.series)
create_row(4, self.series, self.series_index,
button=self.remove_unused_series_button, icon='trash.png')
button=self.clear_series_button, icon='trash.png')
create_row(5, self.series_index, self.tags)
create_row(6, self.tags, self.rating, button=self.tags_editor_button)
create_row(7, self.rating, self.pubdate)

@ -36,6 +36,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
r('new_book_tags', prefs, setting=CommaSeparatedList)
r('auto_add_path', gprefs, restart_required=True)
r('auto_add_check_for_duplicates', gprefs)
r('auto_add_auto_convert', gprefs)

self.filename_pattern = FilenamePattern(self)
self.metadata_box.layout().insertWidget(0, self.filename_pattern)

@ -151,6 +151,19 @@ Author matching is exact.</string>
<string>&Automatic Adding</string>
</attribute>
<layout class="QGridLayout" name="gridLayout_3">
<item row="3" column="0" colspan="2">
<widget class="QCheckBox" name="opt_auto_add_check_for_duplicates">
<property name="toolTip">
<string>If set, this option will cause calibre to check if a file
being auto-added is already in the calibre library.
If it is, a message will pop up asking you whether
you want to add it anyway.</string>
</property>
<property name="text">
<string>Check for &duplicates when auto-adding files</string>
</property>
</widget>
</item>
<item row="0" column="0" colspan="2">
<widget class="QLabel" name="label">
<property name="text">
@ -168,7 +181,7 @@ Author matching is exact.</string>
</property>
</widget>
</item>
<item row="4" column="0">
<item row="5" column="0">
<widget class="QGroupBox" name="groupBox">
<property name="title">
<string>Ignore files with the following extensions when automatically adding </string>
@ -187,7 +200,7 @@ Author matching is exact.</string>
</layout>
</widget>
</item>
<item row="4" column="1">
<item row="5" column="1">
<spacer name="horizontalSpacer_2">
<property name="orientation">
<enum>Qt::Horizontal</enum>
@ -225,16 +238,10 @@ Author matching is exact.</string>
</item>
</layout>
</item>
<item row="3" column="0" colspan="2">
<widget class="QCheckBox" name="opt_auto_add_check_for_duplicates">
<property name="toolTip">
<string>If set, this option will cause calibre to check if a file
being auto-added is already in the calibre library.
If it is, a message will pop up asking you whether
you want to add it anyway.</string>
</property>
<item row="4" column="0">
<widget class="QCheckBox" name="opt_auto_add_auto_convert">
<property name="text">
<string>Check for &duplicates when auto-adding files</string>
<string>Automatically &convert added files to the current output format</string>
</property>
</widget>
</item>

@ -73,11 +73,13 @@ class OpenSearchOPDSStore(StorePlugin):
type = link.get('type')

if rel and href and type:
if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
if 'http://opds-spec.org/thumbnail' in rel:
s.cover_url = href
elif rel == u'http://opds-spec.org/acquisition/buy':
elif 'http://opds-spec.org/image/thumbnail' in rel:
s.cover_url = href
elif 'http://opds-spec.org/acquisition/buy' in rel:
s.detail_item = href
elif rel == u'http://opds-spec.org/acquisition':
elif 'http://opds-spec.org/acquisition' in rel:
if type:
ext = mimetypes.guess_extension(type)
if ext:

@ -25,7 +25,7 @@ from calibre.ebooks.conversion.config import GuiRecommendations, \
from calibre.gui2.convert import bulk_defaults_for_input_format

def convert_single_ebook(parent, db, book_ids, auto_conversion=False, # {{{
out_format=None):
out_format=None, show_no_format_warning=True):
changed = False
jobs = []
bad = []
@ -91,7 +91,7 @@ def convert_single_ebook(parent, db, book_ids, auto_conversion=False, # {{{
except NoSupportedInputFormats:
bad.append(book_id)

if bad != []:
if bad and show_no_format_warning:
res = []
for id in bad:
title = db.title(id, True)

@ -3243,7 +3243,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
return id


def add_books(self, paths, formats, metadata, add_duplicates=True):
def add_books(self, paths, formats, metadata, add_duplicates=True,
return_ids=False):
'''
Add a book to the database. The result cache is not updated.
:param:`paths` List of paths to book files or file-like objects
@ -3289,7 +3290,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
formats = list(duplicate[1] for duplicate in duplicates)
metadata = list(duplicate[2] for duplicate in duplicates)
return (paths, formats, metadata), len(ids)
return None, len(ids)
return None, (ids if return_ids else len(ids))

def import_book(self, mi, formats, notify=True, import_hooks=True,
apply_import_tags=True, preserve_uuid=False):
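The new return_ids flag above only changes the second element of add_books()'s return value. A minimal usage sketch, not part of this commit (the variable names paths, formats and metadata stand for the parallel lists documented in the docstring):

    # db is a LibraryDatabase2 instance.
    duplicates, ids = db.add_books(paths, formats, metadata,
            add_duplicates=False, return_ids=True)
    # When nothing is skipped as a duplicate, the first value is None and,
    # with return_ids=True, the second value is the list of newly added book
    # ids rather than just their count.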
@ -648,7 +648,10 @@ class BasicNewsRecipe(Recipe):
'url' : URL of print version,
'date' : The publication date of the article as a string,
'description' : A summary of the article
'content' : The full article (can be an empty string). This is used by FullContentProfile
'content' : The full article (can be an empty string). Obsolete,
do not use; instead save the content to a temporary
file and pass a file:///path/to/temp/file.html as
the URL.
}

For an example, see the recipe for downloading `The Atlantic`.
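The docstring change above tells recipe authors to stop filling 'content' and instead point the article at a saved file. A minimal sketch of that pattern (a hypothetical recipe with placeholder HTML, not taken from this commit):

    from calibre.ptempfile import PersistentTemporaryFile
    from calibre.web.feeds.recipes import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title = 'Example'

        def parse_index(self):
            html = '<html><body><h1>Placeholder article</h1></body></html>'
            # Save the full article to a temporary file that outlives this call
            tf = PersistentTemporaryFile('_article.html')
            tf.write(html)
            tf.close()
            # Point the article at the saved file and leave 'content' empty
            articles = [{'title': 'Placeholder', 'url': 'file://' + tf.name,
                'date': '', 'description': '', 'content': ''}]
            return [('Feed', articles)]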