GwR revisions - cdeType PDOC tag

This commit is contained in:
GRiker 2010-03-27 04:10:59 -07:00
commit f7108d173b
23 changed files with 901 additions and 210 deletions

View File

@ -22,3 +22,7 @@ src/cssutils/stylesheets/.svn/
 src/odf/.svn
 tags
 nbproject/
+*.mdproj
+*.pidb
+*.sln
+*.userprefs

Binary file not shown (new image, 400 B).

Binary file not shown (new image, 995 B).

View File

@ -0,0 +1,63 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Franco Venturi <fventuri at comcast.net>'
'''
spectrum.ieee.org
'''

from calibre.web.feeds.news import BasicNewsRecipe
from string import capwords
from urlparse import urljoin

class IEEESpectrum(BasicNewsRecipe):
    title = 'IEEE Spectrum'
    __author__ = 'Franco Venturi'
    description = 'Electronics News from IEEE'
    publisher = 'IEEE'
    category = 'news, electronics, IT, computer science'
    oldest_article = 32
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'en'
    index = 'http://spectrum.ieee.org/magazine/'
    masthead_url = 'http://spectrum.ieee.org/images/logo_hdr.png'

    remove_javascript = True
    remove_tags = [dict(name={'script':True, 'object':True})]
    remove_attributes = ['height','width','alt']
    keep_only_tags = [dict(attrs={'class': {'artSctn':True, 'artTitle':True, 'dekTitle': True}}), dict(attrs={'id':'artBody'})]

    def parse_index(self):
        soup = self.index_to_soup(self.index)

        img = soup.find('img', image='cover.gif', src=True)
        if img is not None:
            self.cover_url = 'http://spectrum.ieee.org'+img['src']

        content = soup.find(id='gnrlContent')
        title = content.find(attrs={'class':'style4'}).string.strip()
        date = ' '.join(title.split()[0:2])
        self.timefmt = ' [' + date + ']'

        contents = []
        for tag in content.findAll(attrs={'class': {'style2':True, 'lstngTitle':True, 'lstngBody': True}}):
            if tag['class'] == 'style2':
                contents.append((capwords(tag.renderContents().strip()), []))
            elif tag['class'] == 'lstngTitle':
                url = urljoin(self.index, tag.findPrevious('a')['href']) + '/0'
                contents[-1][1].append({'title': tag.renderContents().strip(),
                                        'url': url,
                                        'date': date,
                                        'description': '',
                                        'content': ''
                                       })
            elif tag['class'] == 'lstngBody':
                contents[-1][1][-1]['description'] = tag.renderContents().strip()

        return contents

    def preprocess_html(self, soup):
        for a in soup.findAll('a'):
            if not a['href'].lower().startswith('http'):
                a['href'] = urljoin(self.index, a['href'])
        return soup

View File

@ -0,0 +1,36 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
nypost.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class NYPost(BasicNewsRecipe):
    title = 'New York Post'
    __author__ = 'Darko Miletic'
    description = 'Daily newspaper'
    publisher = 'NYP Holdings, Inc.'
    category = 'news, politics, USA'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    masthead_url = 'http://www.nypost.com/rw/SysConfig/WebPortal/nypost/images/nyp_logo_230x32.gif'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} '

    conversion_options = {
                          'comment' : description
                        , 'tags' : category
                        , 'publisher' : publisher
                        , 'language' : language
                        }

    keep_only_tags = [dict(name='div', attrs={'id':'story'})]

    feeds = [(u'Articles', u'http://www.nypost.com/rss/all_section.xml')]

    def print_version(self, url):
        return url.replace('nypost.com/p/','nypost.com/f/print/')

View File

@ -5,7 +5,8 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 nytimes.com
 '''
-import re, time
+import re
+import time

 from calibre import entity_to_unicode
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
@ -14,7 +15,7 @@ class NYTimes(BasicNewsRecipe):
     title = 'New York Times Top Stories'
     __author__ = 'GRiker'
-    language = 'en'
+    language = _('English')
     description = 'Top Stories from the New York Times'
# List of sections typically included in Top Stories. Use a keyword from the # List of sections typically included in Top Stories. Use a keyword from the
@ -39,9 +40,6 @@ class NYTimes(BasicNewsRecipe):
               'world' : 'World'
             }

-    # By default, no sections are skipped.
-    excludeSectionKeywords = []
-
     # Add section keywords from the right column above to skip that section
     # For example, to skip sections containing the word 'Sports' or 'Dining', use:
     # excludeSectionKeywords = ['Sports', 'Dining']
@ -49,36 +47,138 @@ class NYTimes(BasicNewsRecipe):
     # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
     # Fetch only Top Stories
     # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']

+    # By default, no sections are skipped.
+    excludeSectionKeywords = []
# one_picture_per_article specifies that calibre should only use the first image
# from an article (if one exists). If one_picture_per_article = True, the image
# will be moved to a location between the headline and the byline.
# If one_picture_per_article = False, all images from the article will be included
# and shown in their original location.
one_picture_per_article = True
# The maximum number of articles that will be downloaded # The maximum number of articles that will be downloaded
max_articles_per_feed = 40 max_articles_per_feed = 40
timefmt = '' timefmt = ''
needs_subscription = True needs_subscription = True
keep_only_tags = [ dict(attrs={ 'id':['article']}), masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
dict(attrs={'class':['blog wrap']}) ]
remove_tags = [ dict(attrs={'class':['nextArticleLink clearfix', 'clearfix', remove_tags_before = dict(id='article')
'inlineVideo left brightcove', 'entry-meta']}), remove_tags_after = dict(id='article')
dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles', remove_tags = [dict(attrs={'class':[
'portfolioInline','articleInline','readerscomment', 'articleFooter',
'nytRating']}) ] 'articleTools',
'columnGroup doubleRule',
'columnGroup singleRule',
'columnGroup last',
'columnGroup last',
'doubleRule',
'dottedLine',
'entry-meta',
'icon enlargeThis',
'leftNavTabs',
'module box nav',
'nextArticleLink',
'nextArticleLink clearfix',
'post-tools',
'relatedSearchesModule',
'side_tool',
'singleAd',
'subNavigation tabContent active clearfix',
]}),
dict(id=[
'adxLeaderboard',
'archive',
'articleExtras',
'articleInline',
'blog_sidebar',
'cCol',
'entertainmentSearchBar',
'footer',
'header',
'header_search',
'login',
'masthead',
'memberTools',
'navigation',
'portfolioInline',
'relatedArticles',
'side_search',
'side_index',
'side_tool',
'toolsRight',
]),
dict(name=['script', 'noscript', 'style'])]
encoding = 'cp1252'
no_stylesheets = True no_stylesheets = True
extra_css = '.headline {text-align: left;}\n \ extra_css = '.headline {text-align: left;}\n \
.byline {font-family: monospace; \ .byline {font-family: monospace; \
text-align: left; \ text-align: left; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.dateline {font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.timestamp {font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \ margin-bottom: 0px;}\n \
.timestamp {font-size: smaller;}\n \
.source {text-align: left;}\n \ .source {text-align: left;}\n \
.image {text-align: center;}\n \ .image {text-align: center;}\n \
.credit {text-align: right; \ .credit {text-align: right; \
font-size: smaller;}\n \ font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.articleBody {text-align: left;}\n \ .articleBody {text-align: left;}\n \
.authorId {text-align: left; \ .authorId {text-align: left; \
font-style: italic;}\n ' font-style: italic;}\n '
def dump_ans(self, ans) :
total_article_count = 0
for section in ans :
if self.verbose:
self.log("section %s: %d articles" % (section[0], len(section[1])) )
for article in section[1]:
total_article_count += 1
if self.verbose:
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].encode('cp1252','replace')))
self.log( "Queued %d articles" % total_article_count )
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","&#8216;",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","&#8217;",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","&#8220;",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","&#8221;",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","&#8211;",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","&#8212;",fixed)
return fixed
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
try:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
br.submit()
except:
self.log("\nFailed to login")
return br
def get_cover_url(self): def get_cover_url(self):
cover = None cover = None
st = time.localtime() st = time.localtime()
@ -94,26 +194,6 @@ class NYTimes(BasicNewsRecipe):
cover = None cover = None
return cover return cover
def get_masthead_url(self):
masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
br = BasicNewsRecipe.get_browser()
try:
br.open(masthead)
except:
self.log("\nCover unavailable")
masthead = None
return masthead
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
br.submit()
return br
def index_to_soup(self, url_or_raw, raw=False): def index_to_soup(self, url_or_raw, raw=False):
''' '''
OVERRIDE of class method OVERRIDE of class method
@ -138,6 +218,7 @@ class NYTimes(BasicNewsRecipe):
return BeautifulSoup(_raw, markupMassage=massage) return BeautifulSoup(_raw, markupMassage=massage)
# Entry point # Entry point
print "index_to_soup()"
soup = get_the_soup( self.encoding, url_or_raw ) soup = get_the_soup( self.encoding, url_or_raw )
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'}) contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')] docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
@ -151,6 +232,16 @@ class NYTimes(BasicNewsRecipe):
return soup return soup
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&","&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def parse_index(self): def parse_index(self):
articles = {} articles = {}
ans = [] ans = []
@ -158,12 +249,14 @@ class NYTimes(BasicNewsRecipe):
feed = key = 'All Top Stories' feed = key = 'All Top Stories'
articles[key] = [] articles[key] = []
ans.append(key) ans.append(key)
self.log("Scanning 1 section ...")
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/') soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
# Fetch the outer table # Fetch the outer table
table = soup.find('table') table = soup.find('table')
previousTable = table previousTable = table
contentTable = None
# Find the deepest table containing the stories # Find the deepest table containing the stories
while True : while True :
@ -191,8 +284,9 @@ class NYTimes(BasicNewsRecipe):
continue continue
skipThisSection = False skipThisSection = False
todays_article_count = 0
# Within this table are <font face="times new roman, times, san serif"> entries # Within this table are <font face="times new roman, times, san serif"> entries
self.log("Fetching feed Top Stories")
for tr in storyblock.findAllNext('tr'): for tr in storyblock.findAllNext('tr'):
if tr.find('span') is not None : if tr.find('span') is not None :
@ -244,6 +338,7 @@ class NYTimes(BasicNewsRecipe):
# Fetch the article titles and URLs # Fetch the article titles and URLs
articleCount = len(sectionblock.findAll('span')) articleCount = len(sectionblock.findAll('span'))
todays_article_count += articleCount
for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) : for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
a = span.find('a', href=True) a = span.find('a', href=True)
url = re.sub(r'\?.*', '', a['href']) url = re.sub(r'\?.*', '', a['href'])
@ -277,6 +372,7 @@ class NYTimes(BasicNewsRecipe):
if duplicateFound: if duplicateFound:
# Continue fetching, don't add this article # Continue fetching, don't add this article
todays_article_count -= 1
continue continue
if not articles.has_key(feed): if not articles.has_key(feed):
@ -284,11 +380,138 @@ class NYTimes(BasicNewsRecipe):
articles[feed].append( articles[feed].append(
dict(title=title, url=url, date=pubdate, dict(title=title, url=url, date=pubdate,
description=description, author=author, content='')) description=description, author=author, content=''))
# self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories"))
ans = self.sort_index_by(ans, {'Top Stories':-1}) ans = self.sort_index_by(ans, {'Top Stories':-1})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)] ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
self.dump_ans(ans)
return ans return ans
def preprocess_html(self, soup):
return self.strip_anchors(soup)
def postprocess_html(self,soup, True):
if self.one_picture_per_article:
# Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'})
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
if largeImg:
for inlineImg in inlineImgs:
inlineImg.extract()
else:
if inlineImgs:
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
# Move firstImg after headline
cgFirst = soup.find(True, {'class':'columnGroup first'})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
# Change class="kicker" to <h3>
kicker = soup.find(True, {'class':'kicker'})
if kicker and kicker.contents[0]:
h3Tag = Tag(soup, "h3")
h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
use_alt=False)))
kicker.replaceWith(h3Tag)
# Change captions to italic -1
for caption in soup.findAll(True, {'class':'caption'}) :
if caption and caption.contents[0]:
emTag = Tag(soup, "em")
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos")
if mp_off >= 0:
c = c[:mp_off]
emTag.insert(0, c)
hrTag = Tag(soup, 'hr')
#hrTag['style'] = "margin-top:0em;margin-bottom:0em"
emTag.insert(1, hrTag)
caption.replaceWith(emTag)
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
if h1:
headline = h1.find("nyt_headline")
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
else:
# Blog entry - replace headline, remove <hr> tags
headline = soup.find('title')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
hr.extract()
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
if masthead:
# Nuke the href
if masthead.a:
del(masthead.a['href'])
tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag)
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents:
bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
# Synthesize a section header
dsk = soup.find('meta', attrs={'name':'dsk'})
if dsk and dsk.has_key('content'):
hTag = Tag(soup,'h3')
hTag['class'] = 'section'
hTag.insert(0,NavigableString(dsk['content']))
articleTag = soup.find(True, attrs={'id':'article'})
if articleTag:
articleTag.insert(0,hTag)
# Add class="articleBody" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag:
divTag['class'] = divTag['id']
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]:
tag = Tag(soup, "p")
tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False)))
divTag.replaceWith(tag)
return soup
def strip_anchors(self,soup): def strip_anchors(self,soup):
paras = soup.findAll(True) paras = soup.findAll(True)
for para in paras: for para in paras:
@ -297,94 +520,3 @@ class NYTimes(BasicNewsRecipe):
if a.img is None: if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace')) a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup return soup
def preprocess_html(self, soup):
# refresh = soup.find('meta', {'http-equiv':'refresh'})
# if refresh is None:
# return self.strip_anchors(soup)
#
# content = refresh.get('content').partition('=')[2]
# raw = self.browser.open('http://www.nytimes.com'+content).read()
# soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
return self.strip_anchors(soup)
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is not None:
content = refresh.get('content').partition('=')[2]
raw = self.browser.open('http://www.nytimes.com'+content).read()
soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
soup = self.strip_anchors(soup)
# Test for empty content
body = soup.find('body')
tagCount = len(body.findAll(True))
if tagCount:
# print "%d tags in article" % tagCount
return soup
else:
print "no allowed content found, removing article"
raise Exception
def postprocess_html(self,soup, True):
# Change class="kicker" to <h3>
kicker = soup.find(True, {'class':'kicker'})
if kicker is not None :
h3Tag = Tag(soup, "h3")
h3Tag.insert(0, kicker.contents[0])
kicker.replaceWith(h3Tag)
# Change captions to italic -1
for caption in soup.findAll(True, {'class':'caption'}) :
if caption is not None:
emTag = Tag(soup, "em")
emTag.insert(0, caption.contents[0])
hrTag = Tag(soup, 'hr')
emTag.insert(1, hrTag)
caption.replaceWith(emTag)
# Change <nyt_headline> to <h2>
headline = soup.find("nyt_headline")
if headline is not None :
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, headline.contents[0])
soup.h1.replaceWith(tag)
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
if masthead is not None :
# Nuke the href
if masthead.a is not None :
del(masthead.a['href'])
tag = Tag(soup, "h3")
tag.insert(0, masthead.contents[0])
soup.h1.replaceWith(tag)
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
# Synthesize a section header
dsk = soup.find('meta', attrs={'name':'dsk'})
if dsk is not None and dsk.has_key('content'):
hTag = Tag(soup,'h3')
hTag['class'] = 'section'
hTag.insert(0,NavigableString(dsk['content']))
articleTag = soup.find(True, attrs={'id':'article'})
articleTag.insert(0,hTag)
# Add class="articleBody" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag is not None :
divTag['class'] = divTag['id']
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag is not None :
divTag['class'] = divTag['id']
return soup

View File

@ -31,7 +31,7 @@ class NYTimes(BasicNewsRecipe):
     # List of sections to exclude
     # To add a section, copy the section name from the allSectionKeywords list above
     # For example, to exclude 'Dining' and 'Weddings':
-    # excludeSectionKeywords = ['Dining','Weddings']
+    #excludeSectionKeywords = ['Dining','Weddings']
     excludeSectionKeywords = []

     # List of sections to include (test and debug only)
@ -56,20 +56,25 @@ class NYTimes(BasicNewsRecipe):
     remove_tags_before = dict(id='article')
     remove_tags_after = dict(id='article')
     remove_tags = [dict(attrs={'class':[
+                            'articleFooter',
                             'articleTools',
                             'columnGroup doubleRule',
+                            'columnGroup singleRule',
                             'columnGroup last',
+                            'columnGroup last',
                             'doubleRule',
                             'dottedLine',
                             'entry-meta',
                             'icon enlargeThis',
                             'leftNavTabs',
                             'module box nav',
+                            'nextArticleLink',
                             'nextArticleLink clearfix',
                             'post-tools',
                             'relatedSearchesModule',
                             'side_tool',
                             'singleAd',
+                            'subNavigation tabContent active clearfix',
                             ]}),
                    dict(id=[
                             'adxLeaderboard',
@ -222,11 +227,11 @@ class NYTimes(BasicNewsRecipe):
             if div['class'] == 'section-headline':
                 key = string.capwords(feed_title(div))
-                excluded = re.compile('|'.join(self.excludeSectionKeywords))
-                if excluded.search(key):
-                    self.log("Skipping section %s" % key)
-                    continue
+                if self.excludeSectionKeywords:
+                    excluded = re.compile('|'.join(self.excludeSectionKeywords))
+                    if excluded.search(key):
+                        self.log("Skipping section %s" % key)
+                        continue

                 articles[key] = []
                 ans.append(key)
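Note on the hunk above: with the old code, an empty excludeSectionKeywords list produced an empty regular expression, and an empty pattern matches every string, so every section was skipped. A minimal standalone sketch of the pitfall (plain Python, not calibre code):

import re

keywords = []                                    # the default: exclude nothing
always_matches = re.compile('|'.join(keywords))  # '|'.join([]) == '' -> matches everything
assert always_matches.search('Sports') is not None

keywords = ['Sports', 'Dining']                  # guarded path: only compile when non-empty
excluded = re.compile('|'.join(keywords))
assert excluded.search('Dining Out') is not None
assert excluded.search('World News') is None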

View File

@ -21,9 +21,8 @@ class Timesonline(BasicNewsRecipe):
     use_embedded_content = False
     simultaneous_downloads = 1
     encoding = 'ISO-8859-1'
-    lang = 'en-UK'
     remove_javascript = True
-    language = 'en'
+    language = 'en_GB'

     recursions = 9
     match_regexps = [r'http://www.timesonline.co.uk/.*page=[2-9]']

View File

@ -0,0 +1,47 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
tulsaworld.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class TulsaWorld(BasicNewsRecipe):
    title = 'Tulsa World'
    __author__ = 'Darko Miletic'
    description = 'Find breaking news, local news, Oklahoma weather, sports, business, entertainment, lifestyle, opinion, government, movies, books, jobs, education, blogs, video & multimedia.'
    publisher = 'World Publishing Co.'
    category = 'Tulsa World, tulsa world, daily newspaper, breaking news, stories, articles, news, local, weather, coverage, editorial, government, education, community, sports, business, entertainment, lifestyle, opinion, multimedia, media, blogs, consumer, OU, OSU, TU, ORU, football, basketball, school, schools, sudoku, movie reviews, stocks, classified ads, classifieds, books, job, jobs, careers, real estate, home, homes, Oklahoma, northeastern, reviews, auto, autos, archives, forecasts, Sooners, Cowboys, Hurricane, Golden Eagles, NFL, NBA, MLB, pro football, scores, college basketball, college football, college baseball, sports columns, fashion and style, associated press, regional news coverage, health, obituaries, politics, political news, Jenks, Union, Owasso, Tulsa, Booker T. Washington, Trojans, Rams, Hornets, video, photography, photos, images, games, search, the picker, predictions, satellite, family, food, teens, polls, births, celebrations, death notices, divorces, marriages, obituaries, audio, podcasts.'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    country = 'US'
    remove_empty_feeds = True
    masthead_url = 'http://www.tulsaworld.com/images/TW_logo-blue-footer.jpg'
    extra_css = ' body{font-family: Arial,Verdana,sans-serif } img{margin-bottom: 0.4em} .articleHeadline{font-size: xx-large; font-weight: bold} .articleKicker{font-size: x-large; font-weight: bold} .articleByline,.articleDate{font-size: small} .leadp{font-size: 1.1em} '

    conversion_options = {
                          'comment' : description
                        , 'tags' : category
                        , 'publisher' : publisher
                        , 'language' : language
                        , 'linearize_tables' : True
                        }

    keep_only_tags = [dict(name='div',attrs={'id':['ctl00_body1_ArticleControl_divArticleText','ctl00_BodyContent_ArticleControl_divArticleText']})]

    feeds = [
              (u'News' , u'http://www.tulsaworld.com/site/rss.aspx?group=1')
             ,(u'Business', u'http://www.tulsaworld.com/site/rss.aspx?group=5')
             ,(u'Opinion' , u'http://www.tulsaworld.com/site/rss.aspx?group=7')
            ]

    def get_article_url(self, article):
        return article.get('link', None).rpartition('&rss')[0]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)

View File

@ -7,62 +7,430 @@ usatoday.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, Tag
import re import re
class USAToday(BasicNewsRecipe): class USAToday(BasicNewsRecipe):
title = 'USA Today' title = 'USA Today'
timefmt = ' [%d %b %Y]' __author__ = 'GRiker'
__author__ = 'Kovid Goyal and Sujata Raman' oldest_article = 1
timefmt = ''
max_articles_per_feed = 20 max_articles_per_feed = 20
language = 'en' language = 'en'
no_stylesheets = True no_stylesheets = True
extra_css = ''' extra_css = '.headline {text-align: left;}\n \
.inside-head{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold } .byline {font-family: monospace; \
.inside-head2{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold } text-align: left; \
.inside-head3{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold } margin-bottom: 1em;}\n \
h3{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold; } .image {text-align: center;}\n \
h4{font-family:Arial,Helvetica,sans-serif; font-size:x-small; font-weight:bold; } .caption {text-align: center; \
.side-by-side{font-family:Arial,Helvetica,sans-serif; font-size:x-small;} font-size: smaller; \
#byLineTag{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} font-style: italic}\n \
.inside-copy{font-family:Arial,Helvetica,sans-serif; font-size:x-small;text-align:left;} .credit {text-align: right; \
.caption{font-family:Arial,Helvetica,sans-serif; font-size:x-small;} margin-bottom: 0em; \
li{font-family:Arial,Helvetica,sans-serif; font-size:x-small;text-align:left ;} font-size: smaller;}\n \
.vatext{font-family:Arial,Helvetica,sans-serif; font-size:x-small;text-align:left ;} .articleBody {text-align: left;}\n '
.vaTextBold{font-family:Arial,Helvetica,sans-serif; font-size:x-small;font-weight:bold; color:#666666;}
'''
remove_tags = [
{'class':['tagListLabel','piped-taglist-string','socialcontainer','social-wrapper',]},
{'id':['topSocialButtons']},
]
conversion_options = { 'linearize_tables' : True } conversion_options = { 'linearize_tables' : True }
#simultaneous_downloads = 1
preprocess_regexps = [
(re.compile(r'<BODY.*?<!--Article Goes Here-->', re.IGNORECASE | re.DOTALL), lambda match : '<BODY>'),
(re.compile(r'<!--Article End-->.*?</BODY>', re.IGNORECASE | re.DOTALL), lambda match : '</BODY>'),
]
feeds = [ feeds = [
('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'), ('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'), ('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
('Personal Tech', 'http://rssfeeds.usatoday.com/UsatodaycomTech-PersonalTalk'),
('Science', 'http://rssfeeds.usatoday.com/TP-ScienceFair'),
('Health', 'http://rssfeeds.usatoday.com/UsatodaycomHealth-TopStories'),
('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'), ('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'),
('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'), ('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'),
('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'), ('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'),
('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'), ('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'), ('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories'),
] ]
keep_only_tags = [dict(attrs={'class':[
'byLine',
'inside-copy',
'inside-head',
'inside-head2',
'item',
'item-block',
'photo-container',
]}),
dict(id=[
'applyMainStoryPhoto',
'permalink',
])]
## Getting the print version remove_tags = [dict(attrs={'class':[
'comments',
'jump',
'pagetools',
'post-attributes',
'tags',
]}),
dict(id=[])]
def print_version(self, url): #feeds = [('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles')]
return 'http://www.printthis.clickability.com/pt/printThis?clickMap=printThis&fb=Y&url=' + url
def dump_hex(self, src, length=16):
''' Diagnostic '''
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
N=0; result=''
while src:
s,src = src[:length],src[length:]
hexa = ' '.join(["%02X"%ord(x) for x in s])
s = s.translate(FILTER)
result += "%04X %-*s %s\n" % (N, length*3, hexa, s)
N+=length
print result
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","&#8216;",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","&#8217;",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","&#8220;",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","&#8221;",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","&#8211;",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","&#8212;",fixed)
return fixed
def get_masthead_url(self):
masthead = 'http://i.usatoday.net/mobile/_common/_images/565x73_usat_mobile.gif'
br = BasicNewsRecipe.get_browser()
try:
br.open(masthead)
except:
self.log("\nCover unavailable")
masthead = None
return masthead
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&","&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def parse_feeds(self, *args, **kwargs):
parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
# Count articles for progress dialog
content_feeds = []
article_count = 0
for feed in parsed_feeds:
article_count += len(feed)
self.log( "Queued %d articles" % article_count)
return parsed_feeds
def preprocess_html(self, soup):
soup = self.strip_anchors(soup)
return soup
def postprocess_html(self, soup, first_fetch): def postprocess_html(self, soup, first_fetch):
for t in soup.findAll(['table', 'tr', 'td']):
t.name = 'div' # Remove navLinks <div class="inside-copy" style="padding-bottom:3px">
navLinks = soup.find(True,{'style':'padding-bottom:3px'})
if navLinks:
navLinks.extract()
# Remove <div class="inside-copy" style="margin-bottom:10px">
gibberish = soup.find(True,{'style':'margin-bottom:10px'})
if gibberish:
gibberish.extract()
# Change <inside-head> to <h2>
headline = soup.find(True, {'class':['inside-head','inside-head2']})
if not headline:
headline = soup.find('h3')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, headline.contents[0])
headline.replaceWith(tag)
else:
print "unable to find headline:\n%s\n" % soup
# Change byLine to byline, change commas to middot
# Kindle renders commas in byline as '&'
byline = soup.find(True, {'class':'byLine'})
if byline:
byline['class'] = 'byline'
# Replace comma with middot
byline.contents[0].replaceWith(re.sub(","," &middot;", byline.renderContents()))
jumpout_punc_list = [':','?']
# Remove the inline jumpouts in <div class="inside-copy">
paras = soup.findAll(True, {'class':'inside-copy'})
for para in paras:
if re.match("<b>[\w\W]+ ",para.renderContents()):
p = para.find('b')
for punc in jumpout_punc_list:
punc_offset = p.contents[0].find(punc)
if punc_offset == -1:
continue
if punc_offset > 1:
if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
#print "extracting \n%s\n" % para.prettify()
para.extract()
# Reset class for remaining
paras = soup.findAll(True, {'class':'inside-copy'})
for para in paras:
para['class'] = 'articleBody'
# Remove inline jumpouts in <p>
paras = soup.findAll(['p'])
for p in paras:
if hasattr(p,'contents') and len(p.contents):
for punc in jumpout_punc_list:
punc_offset = p.contents[0].find(punc)
if punc_offset == -1:
continue
if punc_offset > 2 and hasattr(p,'a') and len(p.contents):
#print "evaluating %s\n" % p.contents[0][:punc_offset+1]
if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
#print "extracting \n%s\n" % p.prettify()
p.extract()
# Capture the first img, insert after headline
imgs = soup.findAll('img')
print "postprocess_html(): %d images" % len(imgs)
if imgs:
divTag = Tag(soup, 'div')
divTag['class'] = 'image'
body = soup.find('body')
img = imgs[0]
#print "img: \n%s\n" % img.prettify()
# Table for photo and credit
tableTag = Tag(soup,'table')
# Photo
trimgTag = Tag(soup, 'tr')
tdimgTag = Tag(soup, 'td')
tdimgTag.insert(0,img)
trimgTag.insert(0,tdimgTag)
tableTag.insert(0,trimgTag)
# Credit
trcreditTag = Tag(soup, 'tr')
tdcreditTag = Tag(soup, 'td')
tdcreditTag['class'] = 'credit'
credit = soup.find('td',{'class':'photoCredit'})
if credit:
tdcreditTag.insert(0,NavigableString(credit.renderContents()))
else:
credit = img['credit']
if credit:
tdcreditTag.insert(0,NavigableString(credit))
else:
tdcreditTag.insert(0,NavigableString(''))
trcreditTag.insert(0,tdcreditTag)
tableTag.insert(1,trcreditTag)
dtc = 0
divTag.insert(dtc,tableTag)
dtc += 1
if False:
# Add the caption in the table
tableCaptionTag = Tag(soup,'caption')
tableCaptionTag.insert(0,soup.find('td',{'class':'photoCredit'}).renderContents())
tableTag.insert(1,tableCaptionTag)
divTag.insert(dtc,tableTag)
dtc += 1
body.insert(1,divTag)
else:
# Add the caption below the table
#print "Looking for caption in this soup:\n%s" % img.prettify()
captionTag = Tag(soup,'p')
captionTag['class'] = 'caption'
if hasattr(img,'alt') and img['alt']:
captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['alt']))
divTag.insert(dtc, captionTag)
dtc += 1
else:
try:
captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['cutline']))
divTag.insert(dtc, captionTag)
dtc += 1
except:
pass
hrTag = Tag(soup, 'hr')
divTag.insert(dtc, hrTag)
dtc += 1
# Delete <div id="applyMainStoryPhoto"
photoJunk = soup.find('div',{'id':'applyMainStoryPhoto'})
if photoJunk:
photoJunk.extract()
# Insert img after headline
tag = body.find(True)
insertLoc = 0
headline_found = False
while True:
# Scan the top-level tags
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'headline':
headline_found = True
body.insert(insertLoc,divTag)
break
tag = tag.nextSibling
if not tag:
break
if not headline_found:
# Monolithic <div> - restructure
insert_loc = 0
tag = body.find(True)
while True:
insertLoc += 1
try:
if hasattr(tag,'class') and tag['class'] == 'headline':
headline_found = True
tag.insert(insertLoc,divTag)
break
except:
pass
tag = tag.next
if not tag:
break
# Yank out headline, img and caption
headline = body.find('h2','headline')
img = body.find('div','image')
caption = body.find('p''class')
# body(0) is calibre_navbar
# body(1) is <div class="item">
btc = 1
headline.extract()
body.insert(1, headline)
btc += 1
if img:
img.extract()
body.insert(btc, img)
btc += 1
if caption:
caption.extract()
body.insert(btc, caption)
btc += 1
if len(imgs) > 1:
if True:
[img.extract() for img in imgs[1:]]
else:
# Format the remaining images
# This doesn't work yet
for img in imgs[1:]:
print "img:\n%s\n" % img.prettify()
divTag = Tag(soup, 'div')
divTag['class'] = 'image'
# Table for photo and credit
tableTag = Tag(soup,'table')
# Photo
trimgTag = Tag(soup, 'tr')
tdimgTag = Tag(soup, 'td')
tdimgTag.insert(0,img)
trimgTag.insert(0,tdimgTag)
tableTag.insert(0,trimgTag)
# Credit
trcreditTag = Tag(soup, 'tr')
tdcreditTag = Tag(soup, 'td')
tdcreditTag['class'] = 'credit'
try:
tdcreditTag.insert(0,NavigableString(img['credit']))
except:
tdcreditTag.insert(0,NavigableString(''))
trcreditTag.insert(0,tdcreditTag)
tableTag.insert(1,trcreditTag)
divTag.insert(0,tableTag)
soup.img.replaceWith(divTag)
return soup
def postprocess_book(self, oeb, opts, log) :
def extract_byline(href) :
# <meta name="byline" content=
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
byline = soup.find('div',attrs={'class':'byline'})
if byline:
byline['class'] = 'byline'
# Replace comma with middot
byline.contents[0].replaceWith(re.sub(","," &middot;", byline.renderContents()))
return byline.renderContents()
else :
paras = soup.findAll(text=True)
for para in paras:
if para.startswith("Copyright"):
return para[len('Copyright xxxx '):para.find('.')]
return None
def extract_description(href) :
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
description = soup.find('meta',attrs={'name':'description'})
if description :
return self.massageNCXText(description['content'])
else:
# Take first paragraph of article
articleBody = soup.find('div',attrs={'id':['articleBody','item']})
if articleBody:
paras = articleBody.findAll('p')
for p in paras:
if p.renderContents() > '' :
return self.massageNCXText(self.tag_to_string(p,use_alt=False))
else:
print "Didn't find <div id='articleBody'> in this soup:\n%s" % soup.prettify()
return None
# Method entry point here
# Single section toc looks different than multi-section tocs
if oeb.toc.depth() == 2 :
for article in oeb.toc :
if article.author is None :
article.author = extract_byline(article.href)
if article.description is None :
article.description = extract_description(article.href)
elif oeb.toc.depth() == 3 :
for section in oeb.toc :
for article in section :
article.author = extract_byline(article.href)
'''
if article.author is None :
article.author = self.massageNCXText(extract_byline(article.href))
else:
article.author = self.massageNCXText(article.author)
'''
if article.description is None :
article.description = extract_description(article.href)
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup return soup

View File

@ -20,7 +20,7 @@ class ANDROID(USBMS):
     VENDOR_ID = {
             0x0bb4 : { 0x0c02 : [0x100], 0x0c01 : [0x100]},
             0x22b8 : { 0x41d9 : [0x216]},
-            0x18d1 : { 0x4e11 : [0x0100], 0x4e12: [0x0100]},
+            0x18d1 : { 0x4e11 : [0x0100, 0x226], 0x4e12: [0x0100, 0x226]},
             0x04e8 : { 0x681d : [0x0222]},
             }
     EBOOK_DIR_MAIN = ['wordplayer/calibretransfer', 'eBooks/import', 'Books']

View File

@ -226,11 +226,19 @@ class BookList(_BookList):
             for item in collections:
                 item = item.strip()
                 mitem = getattr(mi, item, None)
+                titems = []
                 if mitem:
                     if isinstance(mitem, list):
-                        tags.extend(mitem)
+                        titems = mitem
                     else:
-                        tags.append(mitem)
+                        titems = [mitem]
+                if item == 'tags' and titems:
+                    litems = []
+                    for i in titems:
+                        if not i.strip().startswith('[') and not i.strip().endswith(']'):
+                            litems.append(i)
+                    titems = litems
+                tags.extend(titems)
             if tags:
                 tags = list(set(tags))
                 if hasattr(mi, 'tag_order'):
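The practical effect of the new titems filtering is that bracketed marker tags (for example the '[kindle_pdoc]' tag used elsewhere in this commit) no longer become device collections. A simplified standalone sketch of just the filter; the function name and sample tags are made up for illustration:

def collection_tags(tags):
    # Keep only tags that do not look like bracketed marker tags such as '[kindle_pdoc]'
    return [t for t in tags
            if not t.strip().startswith('[') and not t.strip().endswith(']')]

# collection_tags(['Fiction', '[kindle_pdoc]', ' [New] '])  ->  ['Fiction']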

View File

@ -150,7 +150,8 @@ class PRS505(CLI, Device):
         for location in locations:
             info = metadata.next()
             path = location[0]
-            blist = 2 if location[3] == 'cardb' else 1 if location[3] == 'carda' else 0
+            oncard = location[3]
+            blist = 2 if oncard == 'cardb' else 1 if oncard == 'carda' else 0

             if self._main_prefix and path.startswith(self._main_prefix):
                 name = path.replace(self._main_prefix, '')
@ -166,7 +167,11 @@ class PRS505(CLI, Device):
             opts = self.settings()
             collections = opts.extra_customization.split(',') if opts.extra_customization else []
-            booklists[blist].add_book(info, name, collections, *location[1:-1])
+            booklist = booklists[blist]
+            if not hasattr(booklist, 'add_book'):
+                raise ValueError(('Incorrect upload location %s. Did you choose the'
+                    ' correct card A or B, to send books to?')%oncard)
+            booklist.add_book(info, name, collections, *location[1:-1])
         fix_ids(*booklists)
def delete_books(self, paths, end_session=True): def delete_books(self, paths, end_session=True):

View File

@ -230,14 +230,25 @@ class HTMLPreProcessor(object):
         end_rules = []
         if getattr(self.extra_opts, 'remove_header', None):
-            end_rules.append(
-                (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
-            )
+            try:
+                end_rules.append(
+                    (re.compile(self.extra_opts.header_regex), lambda match : '')
+                )
+            except:
+                import traceback
+                print 'Failed to parse remove_header regexp'
+                traceback.print_exc()

         if getattr(self.extra_opts, 'remove_footer', None):
-            end_rules.append(
-                (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
-            )
+            try:
+                end_rules.append(
+                    (re.compile(self.extra_opts.footer_regex), lambda match : '')
+                )
+            except:
+                import traceback
+                print 'Failed to parse remove_footer regexp'
+                traceback.print_exc()

         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
             if length:
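The try/except wrappers exist because header_regex and footer_regex are user-supplied options; a malformed pattern should be reported instead of aborting the whole conversion. A minimal standalone illustration of the failure mode being caught (the diff itself uses a bare except and prints the traceback):

import re

user_pattern = r'(<b>Page \d+'        # unbalanced parenthesis, as a user might type
try:
    rule = (re.compile(user_pattern), lambda match: '')
except re.error:
    rule = None                       # log the bad pattern and carry on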

View File

@ -230,8 +230,8 @@ class FB2MLizer(object):
                 if '://' in href:
                     fb2_text.append('<a xlink:href="%s">' % href)
                 else:
-                    if '#' not in href:
-                        href += '#'
+                    if href.startswith('#'):
+                        href = href[1:]
                     if href not in self.link_hrefs.keys():
                         self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys())
                     href = self.link_hrefs[href]

View File

@ -12,6 +12,7 @@ __docformat__ = 'restructuredtext en'
 from struct import pack, unpack
 from cStringIO import StringIO

+from calibre.ebooks.conversion.config import load_defaults
 from calibre.ebooks.mobi import MobiError
 from calibre.ebooks.mobi.writer import rescale_image, MAX_THUMB_DIMEN
 from calibre.ebooks.mobi.langcodes import iana2mobi
@ -350,15 +351,10 @@ class MetadataUpdater(object):
         subjects = '; '.join(mi.tags)
         update_exth_record((105, subjects.encode(self.codec, 'replace')))

-        # >>> Begin patch for ticket #4652 <<<
-        kindle_doc_types = set([u'[kindle_ebok]',u'[kindle_pdoc]'])
-        doc_type = list(kindle_doc_types.intersection(set(mi.tags)))[0]
-        if doc_type:
-            if doc_type == '[kindle_ebok]':
-                update_exth_record((501,str('EBOK')))
-            elif doc_type == '[kindle_pdoc]':
-                update_exth_record((501, str('PDOC')))
-        # >>> End patch
+        prefs = load_defaults('mobi_output')
+        kindle_pdoc = prefs.get('personal_doc', None)
+        if kindle_pdoc in mi.tags:
+            update_exth_record((501, str('PDOC')))

         if mi.pubdate:
             update_exth_record((106, str(mi.pubdate).encode(self.codec, 'replace')))
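Net effect of the hunk above: instead of looking for hard-coded '[kindle_ebok]'/'[kindle_pdoc]' tags, the updater reads the personal_doc tag configured for the MOBI output plugin (default '[kindle_pdoc]', per the new option further down) and, when the book carries that tag, writes EXTH record 501 (cdeType) as 'PDOC' so the Kindle files the book under personal documents. A simplified sketch of just the decision, with the EXTH write abstracted away:

def cde_type_record(book_tags, personal_doc_tag='[kindle_pdoc]'):
    # Return the EXTH (record_id, value) pair to write, or None to leave cdeType alone.
    if personal_doc_tag and personal_doc_tag in book_tags:
        return (501, 'PDOC')
    return None

# cde_type_record(['News', '[kindle_pdoc]'])  ->  (501, 'PDOC')
# cde_type_record(['News'])                   ->  None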

View File

@ -4,7 +4,7 @@ __copyright__ = '2010, Greg Riker <griker@hotmail.com>'
 __docformat__ = 'restructuredtext en'
 ''' Read/write metadata from Amazon's topaz format '''
-import os, StringIO, sys
+import StringIO, sys
 from struct import pack

 from calibre.ebooks.metadata import MetaInformation
@ -83,7 +83,7 @@ class MetadataUpdater(object):
         sig = self.data[:4]
         if not sig.startswith('TPZ'):
-            raise ValueError("'%s': unexpected Topaz signature '%s'" % (os.path.basename(stream.name),self.data[:4]))
+            raise ValueError("'%s': Not a Topaz file" % getattr(stream, 'name', 'Unnamed stream'))
         offset = 4
         self.header_records, consumed = self.decode_vwi(self.data[offset:offset+4])
@ -92,13 +92,13 @@ class MetadataUpdater(object):
         # First integrity test - metadata header
         if not 'metadata' in self.topaz_headers:
-            raise ValueError("'%s': Topaz metadata record missing" % os.path.basename(stream.name))
+            raise ValueError("'%s': Invalid Topaz format - no metadata record" % getattr(stream, 'name', 'Unnamed stream'))

         # Second integrity test - metadata body
         md_offset = self.topaz_headers['metadata']['blocks'][0]['offset']
         md_offset += self.base
         if self.data[md_offset+1:md_offset+9] != 'metadata':
-            raise ValueError("'%s': damaged Topaz metadata record" % os.path.basename(stream.name))
+            raise ValueError("'%s': Damaged metadata record" % getattr(stream, 'name', 'Unnamed stream'))

     def book_length(self):
         ''' convenience method for retrieving book length '''

View File

@ -36,6 +36,9 @@ class MOBIOutput(OutputFormatPlugin):
             recommended_value=False, level=OptionRecommendation.LOW,
             help=_('Disable compression of the file contents.')
         ),
+        OptionRecommendation(name='personal_doc', recommended_value='[kindle_pdoc]',
+            help=_('Tag marking book to be filed with Personal Docs')
+        ),
     ])

     def check_for_periodical(self):

View File

@ -168,9 +168,9 @@ class BookHeader(object):
                 try:
                     self.exth.mi.language = mobi2iana(langid, sublangid)
                 except:
-                    self.log.exception('Unknown language code')
+                    self.log.exception("'%s': Unknown language code" % getattr(stream, 'name', 'Unnamed stream'))
             except:
-                self.log.exception('Invalid EXTH header')
+                self.log.exception("'%s': Invalid EXTH header" % getattr(stream, 'name', 'Unnamed stream'))
                 self.exth_flag = 0
@ -833,7 +833,7 @@ def get_metadata(stream):
     try:
         im = PILImage.open(buf)
     except:
-        log.exception("Failed to read MOBI cover: '%s'" % os.path.basename(stream.name))
+        log.exception("'%s': Failed to read MOBI cover" % getattr(stream, 'name', 'Unnamed stream'))
     else:
         obuf = cStringIO.StringIO()
         im.convert('RGB').save(obuf, format='JPEG')

View File

@ -260,8 +260,8 @@ class PMLMLizer(object):
                         href += '#'
                     if href not in self.link_hrefs.keys():
                         self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys())
-                    href = self.link_hrefs[href]
-                    text.append('\\q="#%s"' % href)
+                    href = '#%s' % self.link_hrefs[href]
+                    text.append('\\q="%s"' % href)
                     tags.append('q')

             # Anchor ids

View File

@ -24,7 +24,7 @@ class PluginWidget(Widget, Ui_Form):
     def __init__(self, parent, get_option, get_help, db=None, book_id=None):
         Widget.__init__(self, parent, 'mobi_output',
                 ['prefer_author_sort', 'rescale_images', 'toc_title',
-                    'dont_compress', 'no_inline_toc', 'masthead_font']
+                    'dont_compress', 'no_inline_toc', 'masthead_font','personal_doc']
                 )
         self.db, self.book_id = db, book_id

View File

@ -6,8 +6,8 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>421</width>
-    <height>300</height>
+    <width>521</width>
+    <height>331</height>
    </rect>
   </property>
   <property name="windowTitle">
@ -64,13 +64,27 @@
   <item>
    <widget class="QLabel" name="label_2">
     <property name="text">
-     <string>Masthead font:</string>
+     <string>Periodical masthead font:</string>
     </property>
    </widget>
   </item>
   <item>
    <widget class="QComboBox" name="opt_masthead_font"/>
   </item>
<item>
<layout class="QHBoxLayout" name="horizontalLayout">
<item>
<widget class="QLabel" name="label_3">
<property name="text">
<string>Personal Doc tag:</string>
</property>
</widget>
</item>
<item>
<widget class="QLineEdit" name="opt_personal_doc"/>
</item>
</layout>
</item>
   <item>
    <spacer name="verticalSpacer">
     <property name="orientation">
@ -79,7 +93,7 @@
     <property name="sizeHint" stdset="0">
      <size>
       <width>20</width>
-      <height>55</height>
+      <height>40</height>
      </size>
     </property>
    </spacer>

View File

@ -864,10 +864,10 @@ class BasicNewsRecipe(Recipe):
                 self.log.error(_('Could not download cover: %s')%str(err))
                 self.log.debug(traceback.format_exc())
         if cu is not None:
-            ext = cu.rpartition('.')[-1]
+            ext = cu.split('/')[-1].rpartition('.')[-1]
             if '?' in ext:
                 ext = ''
-            ext = ext.lower() if ext else 'jpg'
+            ext = ext.lower() if ext and '/' not in ext else 'jpg'
             cpath = os.path.join(self.output_dir, 'cover.'+ext)
             if os.access(cu, os.R_OK):
                 with open(cpath, 'wb') as cfile:
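The change above derives the extension from the last path component only, so dots in the host or directory names can no longer leak path separators into the cover file name. A quick illustration with a made-up cover URL:

cu = 'http://www.example.com/images/current/cover'

old_ext = cu.rpartition('.')[-1]                  # 'com/images/current/cover'
new_ext = cu.split('/')[-1].rpartition('.')[-1]   # 'cover'

# The old expression produced a path like 'cover.com/images/current/cover',
# which cannot be opened for writing; the added '/' check is a further guard
# before falling back to 'jpg'.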