Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-07 10:14:46 -04:00

Commit 96947fd01a: Merge branch 'master' of https://github.com/CoderAllan/calibre
@@ -18,35 +18,6 @@ class EdgeConversationRSS(BasicNewsRecipe):
     oldest_article = 60
     max_articles_per_feed = 100
     no_stylesheets = True
+    auto_cleanup = True
 
-    keep_only_tags = [
-        dict(name='div', attrs={'class': 'HomeLeftPannel IMGCTRL'})]
-    remove_tags = [
-        dict(name='div', attrs={'class': 'Logo'})
-    ]
-
-    feeds = [(u'Edge RSS', u'http://edge.org/feeds/')]
+    feeds = [(u'Edge RSS', u'http://edge.org/feed')]
-
-    def print_version(self, url):
-        return url.replace('conversation/', 'conversation.php?cid=')
-
-    def parse_feeds(self):
-
-        # Call parent's method.
-        feeds = BasicNewsRecipe.parse_feeds(self)
-
-        # Loop through all feeds.
-        for feed in feeds:
-
-            # Loop through all articles in feed.
-            for article in feed.articles[:]:
-
-                # Remove anything that is not a conversation, and remove PDF
-                # files as well...
-                if not ('CONVERSATION' in article.title):
-                    feed.articles.remove(article)
-                elif 'pdf' in article.url:
-                    feed.articles.remove(article)
-
-        return feeds
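This first hunk is representative of the whole commit: hand-tuned keep_only_tags/remove_tags lists and a parse_feeds override give way to calibre's auto_cleanup, and the feed URL is refreshed. For orientation, a minimal recipe that relies on auto_cleanup looks roughly like the sketch below; the class name and feed URL are illustrative placeholders, not taken from this commit.

from calibre.web.feeds.news import BasicNewsRecipe


class ExampleAutoCleanupRecipe(BasicNewsRecipe):
    # Sketch only: title and feed URL are placeholders.
    title = u'Example Feed'
    oldest_article = 60           # skip items older than 60 days
    max_articles_per_feed = 100
    no_stylesheets = True
    auto_cleanup = True           # heuristic article extraction instead of per-site tag lists

    feeds = [(u'Example RSS', u'http://example.com/feed')]

With auto_cleanup enabled, BasicNewsRecipe extracts the main article body heuristically, which is why per-site keep_only_tags/remove_tags and custom print_version/parse_feeds logic can usually be dropped.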
@@ -1,53 +0,0 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class EuropeanVoice(BasicNewsRecipe):
-    title = u'European Voice'
-    __author__ = 'malfi'
-    oldest_article = 14
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    cover_url = 'http://www.europeanvoice.com/Css/images/logo.gif'
-    language = 'en'
-    keep_only_tags = [dict(name='div', attrs={'id': 'articleLeftColumn'})]
-    remove_tags = [dict(name='div', attrs={'id': 'BreadCrump'})]
-    feeds = [
-        (u'Whole site ', u'http://www.europeanvoice.com/Rss/2.xml'),
-        (u'News and analysis', u'http://www.europeanvoice.com/Rss/6.xml'),
-        (u'Comment', u'http://www.europeanvoice.com/Rss/7.xml'),
-        (u'Special reports', u'http://www.europeanvoice.com/Rss/5.xml'),
-        (u'People', u'http://www.europeanvoice.com/Rss/8.xml'),
-        (u'Career', u'http://www.europeanvoice.com/Rss/11.xml'),
-        (u'Policies', u'http://www.europeanvoice.com/Rss/4.xml'),
-        (u'EVents', u'http://www.europeanvoice.com/Rss/10.xml'),
-        (u'Policies - Economics', u'http://www.europeanvoice.com/Rss/31.xml'),
-        (u'Policies - Business', u'http://www.europeanvoice.com/Rss/19.xml'),
-        (u'Policies - Trade', u'http://www.europeanvoice.com/Rss/25.xml'),
-        (u'Policies - Information society',
-         u'http://www.europeanvoice.com/Rss/20.xml'),
-        (u'Policies - Energy', u'http://www.europeanvoice.com/Rss/15.xml'),
-        (u'Policies - Transport', u'http://www.europeanvoice.com/Rss/18.xml'),
-        (u'Policies - Climate change', u'http://www.europeanvoice.com/Rss/16.xml'),
-        (u'Policies - Environment', u'http://www.europeanvoice.com/Rss/17.xml'),
-        (u'Policies - Farming & food', u'http://www.europeanvoice.com/Rss/23.xml'),
-        (u'Policies - Health & society', u'http://www.europeanvoice.com/Rss/24.xml'),
-        (u'Policies - Justice', u'http://www.europeanvoice.com/Rss/29.xml'),
-        (u'Policies - Foreign affairs', u'http://www.europeanvoice.com/Rss/27.xml')
-    ]
-    extra_css = '''
-        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
-        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
-        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
-        '''
-
-    def print_version(self, url):
-        return url + '?bPrint=1'
-
-    def preprocess_html(self, soup):
-        denied = soup.findAll(True, text='Subscribers')
-        if denied:
-            raise Exception(
-                'Article skipped, because content can only be seen with subscription')
-        return soup
@@ -15,29 +15,17 @@ class Kitsapsun(BasicNewsRecipe):
     publisher = 'Scripps Interactive Newspapers Group'
     category = 'news, Kitsap county, USA'
     language = 'en'
-    oldest_article = 2
-    max_articles_per_feed = 100
+    oldest_article = 7
+    max_articles_per_feed = 50
     no_stylesheets = True
     encoding = 'cp1252'
     use_embedded_content = False
+    auto_cleanup = True
 
-    conversion_options = {
-        'comments': description, 'tags': category, 'language': language, 'publisher': publisher
-    }
-
-    keep_only_tags = [
-        dict(name='div', attrs={'id': ['story_meta', 'story_content']})]
-
-    remove_tags = [dict(name=['object', 'link', 'embed', 'form', 'iframe'])]
-
-    feeds = [
-        (u'News', u'http://www.kitsapsun.com/rss/headlines/news/'),
-        (u'Business', u'http://www.kitsapsun.com/rss/headlines/business/'),
-        (u'Communities', u'http://www.kitsapsun.com/rss/headlines/communities/'),
-        (u'Entertainment', u'http://www.kitsapsun.com/rss/headlines/entertainment/'),
-        (u'Lifestyles', u'http://www.kitsapsun.com/rss/headlines/lifestyles/')
-    ]
-
-    def print_version(self, url):
-        return url.rpartition('/')[0] + '/?print=1'
+    feeds = [(u'News', u'http://www.kitsapsun.com/feeds/rss/news'),
+             (u'Sports', u'http://www.kitsapsun.com/feeds/rss/sports'),
+             (u'Entertainment',
+              u'http://www.kitsapsun.com/feeds/rss/entertainment'),
+             (u'Lifestyles', u'http://www.kitsapsun.com/feeds/rss/lifestyle'),
+             (u'Opinion', u'http://www.kitsapsun.com/feeds/rss/opinion'),
+             ]
@@ -14,5 +14,5 @@ class HindustanTimes(BasicNewsRecipe):
 
     feeds = [
         ('News',
-         'http://www.mobilenations.com/rss/mb.xml'),
+         'http://www.mobilenations.com/about?format=RSS'),
     ]
@@ -1,8 +1,4 @@
-import re
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre import browser
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre.ebooks.BeautifulSoup import Tag
 
 
 class AdvancedUserRecipe1306061239(BasicNewsRecipe):
@@ -31,383 +27,10 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     simultaneous_downloads = 20
     use_embedded_content = False
     recursions = 0
+    auto_cleanup = True
-
-    conversion_options = {
-        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
-    }
 
     feeds = [
-        (u'NME News', u'http://www.nme.com/rss/news'),
-        (u'Reviews', u'http://www.nme.com/rss/reviews'),
-        (u'Blogs', u'http://www.nme.com/rss/blogs'),
+        (u'NME News', u'http://www.nme.com/news/feed'),
+        (u'Reviews', u'http://www.nme.com/reviews/feed/'),
+        (u'Blogs', u'http://www.nme.com/blogs/feed'),
     ]
-
-    keep_only_tags = [
-        dict(name='div', attrs={'id': 'content'}),
-    ]
-
-    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
-                         'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
-
-    remove_tags = [
-        dict(name='meta'),
-        dict(name='span', attrs={'class': 'article_info'}),
-        dict(name='div', attrs={'class': 'breadcrumbs'}),
-        dict(name='div', attrs={'class': 'mugshot'}),
-        dict(name='div', attrs={'class': 'header'}),
-        dict(name='div', attrs={'class': re.compile('youtube.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile('socialbuttons.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': 'clear_both'}),
-        dict(name='div', attrs={'class': re.compile('headline.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': 'member-signedout'}),
-        dict(name='div', attrs={'class': re.compile('prev_next.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile('article_related.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile('feature_bar.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile('ebay.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile('morenews.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile('ticketspopup.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile('ratemy_logprompt.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile('related_artist.*', re.IGNORECASE)}),
-        dict(name='img', attrs={'class': re.compile('video_play_large.*', re.IGNORECASE)}),
-        dict(name='ul', attrs={'class': re.compile('prev_next.*', re.IGNORECASE)}),
-        dict(name='ul', attrs={'class': re.compile('nme_store.*', re.IGNORECASE)}),
-        dict(name='p', attrs={'class': re.compile('top', re.IGNORECASE)}),
-        dict(name='table', attrs={'class': re.compile('tickets.*', re.IGNORECASE)}),
-    ]
-
-    masthead_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
-
-    def get_cover_url(self):
-        magazine_page_raw = self.index_to_soup('http://www.nme.com/magazine', raw=True)
-        magazine_page_raw = re.sub(r'<script\b.+?</script>', '', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
-        magazine_page_raw = re.sub(r'\!\[if ', '!--[if ', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
-        magazine_page = self.index_to_soup(magazine_page_raw)
-        cov = magazine_page.find('img', attrs={'class': 'magcover'})
-        cov2 = str(cov['src'])
-        br = browser()
-        br.set_handle_redirect(False)
-        try:
-            br.open_novisit(cov2)
-            cover_url = str(cov2)
-        except:
-            cover_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
-        return cover_url
-
-    def preprocess_raw_html(self, raw_html, url):
-        '''
-        Need this for a bug on site that prevents blogg post being parsed correctly
-        '''
-        raw_html = re.sub(r'\!\[if ', '!--[if ', raw_html, flags=re.DOTALL | re.IGNORECASE)
-        return raw_html
-
-    def preprocess_html(self, soup):
-        youtube_regex = re.compile(r'(?:youtube\.com/(?:[^/]+/.+/|(?:v|e(?:mbed)?)/|.*[?&]v=)|youtu\.be/)(?P<id>[^"&?/ ]{11})', re.DOTALL | re.IGNORECASE)
-        instagram_regex = re.compile(r'.*?instagram.*?', re.DOTALL | re.IGNORECASE)
-        twitter_regex = re.compile(r'.*?twitter.*?', re.DOTALL | re.IGNORECASE)
-        visualise_regex = re.compile(r'.*?visualise.*?', re.DOTALL | re.IGNORECASE)
-        soundcloud_regex = re.compile(r'(?P<url>.*?(w|api)\.soundcloud.*?com\/(tracks|playlists)\/\d{8,9})', re.DOTALL | re.IGNORECASE)
-        dailymotion_regex = re.compile(r'.*?dailymotion.*?', re.DOTALL | re.IGNORECASE)
-        spotify_regex = re.compile(r'.*?spotify.*?', re.DOTALL | re.IGNORECASE)
-        vine_regex = re.compile(r'.*?vine.*?', re.DOTALL | re.IGNORECASE)
-        doubleHtmlEntities = re.compile(ur'(&)(?P<e>[\d\w\#]*;)', re.DOTALL | re.IGNORECASE | re.UNICODE)
-        for iframe in soup.findAll('iframe'):
-            if iframe.has_key('src') and youtube_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ YouTube ]')
-                pq.insert(1, br)
-                m = youtube_regex.search(iframe['src'])
-                if m.group('id') is not None:
-                    imgTag = Tag(soup, 'img', [('src', 'http://img.youtube.com/vi/' + m.group('id') + '/0.jpg')])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and soundcloud_regex.search(iframe['src']) is not None:  # noqa
-                m = soundcloud_regex.search(iframe['src'])
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ SoundCloud ]')
-                pq.insert(1, br)
-                pq.insert(2, m.group('url'))
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and dailymotion_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ DailyMotion ]')
-                pq.insert(1, br)
-                imgUrl = self.get_dailymotion_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and spotify_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Spotify ]')
-                pq.insert(1, br)
-                imgUrl = self.get_spotify_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and vine_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Vine ]')
-                pq.insert(1, br)
-                imgUrl = self.get_vine_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and visualise_regex.search(iframe['src']) is not None:  # noqa
-                imgUrl = self.get_visualise_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    iframe.replaceWith(imgTag)
-        for blockquote in soup.findAll('blockquote'):
-            if blockquote.has_key('class') and twitter_regex.search(blockquote['class']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Twitter ]')
-                pq.insert(len(pq.contents), br)
-                match = re.search("(?P<url>pic\.twitter[^\s<]+)", str(blockquote))
-                if match is not None:
-                    img = self.get_twitter_pic(str(match.group("url")))
-                    if img is not None:
-                        pq.insert(len(pq.contents), img)
-                for p in blockquote.findAll(name='p'):
-                    x = 0
-                    plen = len(p.contents)
-                    while True:
-                        c = len(pq.contents)
-                        if p.contents[x].string is not None:
-                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(p.contents[x].string), re.IGNORECASE | re.UNICODE))
-                        else:
-                            pq.insert(c, p.contents[x].content)
-                        x += 1
-                        if x == plen:
-                            break
-                    br = Tag(soup, 'br')
-                    pq.insert(len(pq.contents), br)
-                    p.extract()
-                if len(blockquote.contents) > 0:
-                    x = 0
-                    xlen = len(blockquote.contents)
-                    while True:
-                        c = len(pq.contents)
-                        if blockquote.contents[x].string is not None:
-                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(blockquote.contents[x].string), re.IGNORECASE | re.UNICODE))
-                        else:
-                            pq.insert(c, blockquote.contents[x].content)
-                        x += 1
-                        if x == xlen:
-                            break
-                blockquote.replaceWith(pq)
-            elif blockquote.has_key('class') and instagram_regex.search(blockquote['class']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Instagram ]')
-                pq.insert(1, br)
-                a = blockquote.find(name='a', attrs={'href': instagram_regex})
-                imgUrl = None
-                if a is not None:
-                    imgUrl = self.get_instagram_pic(str(a['href']))
-                if imgUrl is not None:
-                    img = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), img)
-                for p in blockquote.findAll(name='p'):
-                    x = 0
-                    plen = len(p.contents)
-                    while x < plen:
-                        c = len(pq.contents)
-                        if p.contents[x].string is not None:
-                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(p.contents[x].string), re.IGNORECASE | re.UNICODE))
-                        # else:
-                        #     pq.insert(c, p.contents[x].content)
-                        x += 1
-                    br = Tag(soup, 'br')
-                    c = len(pq.contents)
-                    pq.insert(c, br)
-                blockquote.replaceWith(pq)
-        for alink in soup.findAll('a'):
-            if alink.string is not None:
-                tstr = alink.string
-                alink.replaceWith(tstr)
-            elif alink.img is not None:
-                tstr = alink.img
-                alink.replaceWith(tstr)
-            elif alink.span is not None:
-                tstr = alink.span
-                alink.replaceWith(tstr)
-        return soup
-
-    def get_visualise_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        bs = BeautifulSoup(raw)
-        imgRaw = bs.find(name='meta', attrs={'property': 'og:image'})
-        if imgRaw is not None:
-            returnValue = str(imgRaw['content'])
-        return returnValue
-
-    def get_twitter_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open('https://' + url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        bs = BeautifulSoup(raw)
-        refresh = bs.find('meta', {'http-equiv': 'refresh'})
-        if refresh is not None:
-            content = refresh.get('content').partition('=')[2]
-            try:
-                raw = self.browser.open(content).read()
-            except:
-                print '404: ' + url
-                return returnValue
-            bs = BeautifulSoup(raw)
-        img = bs.find(name='img', attrs={'alt': re.compile('.*permalink.*', re.IGNORECASE)})
-        if img is not None:
-            returnValue = img
-        return returnValue
-
-    def get_soundcloud_pic(self, url):
-        # content loaded via javascript and require an login and/or registered application identification
-        # returnValue = None
-        # raw = self.browser.open(soundcloudUrl + '&visual=true').read()
-        # bs = BeautifulSoup(raw)
-        # imgRaw = bs.find(name='div', attrs={'style':re.compile(r'backgroud-image:*?',re.IGNORECASE)})
-        # if imgRaw is not None:
-        #     returnValue = str(imgRaw['style'])
-        return None  # returnValue
-
-    def get_instagram_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('\"display_src\":\"(?P<url>http[s]?:.*?)\"', str(raw))
-        if m is not None:
-            returnValue = re.sub(r'\\', '', m.group("url"), flags=re.DOTALL | re.IGNORECASE)
-        return returnValue
-
-    def get_dailymotion_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('("thumbnail_url\"\:\")(?P<url>http.*?)(\")', str(raw))
-        if m is not None:
-            returnValue = re.sub(r'\\', '', m.group("url"), flags=re.DOTALL | re.IGNORECASE)
-        return returnValue
-
-    def get_spotify_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('data-ca=\"(?P<url>.*?)\"', str(raw))
-        if m is not None:
-            returnValue = m.group("url")
-        return returnValue
-
-    def get_vine_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('"thumbnail.*?src=\"(?P<url>.*?)\"', str(raw))
-        if m is not None:
-            returnValue = m.group("url")
-        return returnValue
-
-    preprocess_regexps = [
-        (re.compile(r'<script\b.+?</script>', re.DOTALL | re.IGNORECASE), lambda h1: ''),
-        (re.compile(r'<a.* id="buy-tickets-button".*</a>', re.IGNORECASE), lambda h2: ''),
-        (re.compile(r'<a.* class="gallery.*</a>', re.IGNORECASE), lambda h2: ''),
-    ]
-
-    extra_css = '''
-        h1 h2 {
-            font-family:Arial,Helvetica,sans-serif;
-            font-weight:bold;font-size:large;
-        }
-        h3 {
-            font-family:Arial,Helvetica,sans-serif;
-            font-weight:normal;
-            font-size:small;
-            font-style:italic;
-            display:inline;
-        }
-        body {
-            font-family:Helvetica,Arial,sans-serif;
-            font-size:small;
-        }
-        blockquote {
-            font-family:"Courier New", Courier, monospace;
-            font-size:90%;
-        }
-        img {
-            display:block;
-        }
-        .date{
-            font-style:italic;
-            font-weight:normal;
-        }
-        .article_header>p:not(.date){
-            font-weight:bold;
-        }
-        '''
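The preprocess_html deleted above replaced embedded players (YouTube, SoundCloud, DailyMotion, Spotify, Vine) with plain blockquote placeholders, built with the legacy BeautifulSoup 3 Tag constructor. As a reference point only, here is a minimal sketch of the same idea for the YouTube case written against the bs4 API; the helper name and sample HTML are illustrative and not part of this commit.

import re

from bs4 import BeautifulSoup

YOUTUBE_ID = re.compile(
    r'(?:youtube\.com/(?:embed/|.*[?&]v=)|youtu\.be/)(?P<id>[^"&?/ ]{11})', re.IGNORECASE)


def replace_youtube_iframes(soup):
    # Swap each embedded YouTube player for a static placeholder: a blockquote
    # holding the public thumbnail image plus the original iframe URL.
    for iframe in soup.find_all('iframe', src=True):
        m = YOUTUBE_ID.search(iframe['src'])
        if m is None:
            continue
        quote = soup.new_tag('blockquote')
        quote.append('[ YouTube ] ')
        quote.append(soup.new_tag('img', src='https://img.youtube.com/vi/%s/0.jpg' % m.group('id')))
        quote.append(' ' + iframe['src'])
        iframe.replace_with(quote)
    return soup


if __name__ == '__main__':
    html = '<p><iframe src="https://www.youtube.com/embed/dQw4w9WgXcQ"></iframe></p>'
    print(replace_youtube_iframes(BeautifulSoup(html, 'html.parser')))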
@@ -1,22 +0,0 @@
-__license__ = 'GPL v3'
-__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class TheResurgence(BasicNewsRecipe):
-    title = u'The Resurgence'
-    __author__ = 'Peter Grungi'
-    language = 'en'
-
-    oldest_article = 7
-    max_articles_per_feed = 10
-    auto_cleanup = True
-    cover_url = 'http://cdn.theresurgence.com/images/logo.png'
-    masthead_url = 'http://cdn.theresurgence.com/images/logo.png'
-    language = 'en'
-    publisher = 'The Resurgence'
-    author = 'The Resurgence'
-
-    feeds = [
-        (u'The Resurgence', u'http://feeds.theresurgence.com/TheResurgence?format=xml')]
@@ -10,30 +10,10 @@ class SecurityWatch(BasicNewsRecipe):
     oldest_article = 14
     max_articles_per_feed = 100
     use_embedded_content = False
-    filter_regexps = [r'feedads\.googleadservices\.com']
-    filter_regexps = [r'ad\.doubleclick']
-    filter_regexps = [r'advert']
     language = 'en'
+    auto_cleanup = True
-    extra_css = 'div {text-align:left}'
-
-    remove_tags = [dict(id='topBannerContainer'),
-                   dict(id='topBannerSmall'),
-                   dict(id='topSearchBar'),
-                   dict(id='topSearchForm'),
-                   dict(id='rtBannerMPU'),
-                   dict(id='topNavBar'),
-                   dict(id='breadcrumbs'),
-                   # dict(id='entry-28272'),
-                   dict(id='topSearchLinks'),
-                   dict(name='span', attrs={'class': 'date'})]
-
-    remove_tags_after = [dict(id='googlemp')]
 
     feeds = [
-        (u'securitywatch', u'http://feeds.ziffdavisenterprise.com/RSS/security_watch/')]
+        (u'securitywatch',
+         u'http://feeds.pcmag.com/Rss.aspx/SectionArticles?sectionId=28026')
+    ]
-
-    def postprocess_html(self, soup, first_fetch):
-        for t in soup.findAll(['table', 'tr', 'td']):
-            t.name = 'div'
-        return soup
@@ -17,39 +17,38 @@ class AdvancedUserRecipe1315899507(BasicNewsRecipe):
     auto_cleanup = True
     remove_empty_feeds = True
     publication_type = 'newspaper'
-    masthead_url = 'http://media.signonsandiego.com/e2/sosd/images/sosd_logo.png'
 
     feeds = [
-        (u'Latest News', u'http://www.signonsandiego.com/rss/headlines/'),
-        (u'Local News', u'http://www.signonsandiego.com/rss/headlines/metro/'),
-        (u'Business', u'http://www.signonsandiego.com/rss/headlines/business/'),
-        (u'Politics', u'http://www.signonsandiego.com/rss/headlines/local/politics/'),
-        (u'Border & Immigration', u'http://www.signonsandiego.com/rss/headlines/border/'),
-        (u'Courts', u'http://www.signonsandiego.com/rss/headlines/courts/'),
-        (u'Education', u'http://www.signonsandiego.com/news/education/'),
-        (u'Sports', u'http://www.signonsandiego.com/rss/headlines/sports/'),
-        (u'Chargers', u'http://www.signonsandiego.com/rss/headlines/sports/chargers/'),
-        (u'Padres', u'http://www.signonsandiego.com/rss/headlines/sports/padres/'),
-        (u'NFL', u'http://www.signonsandiego.com/rss/headlines/sports/nfl/'),
-        (u'NBA', u'http://www.signonsandiego.com/rss/headlines/sports/nba/'),
-        (u'Nick Canepa', u'http://www.signonsandiego.com/rss/authors/nick-canepa/'),
-        (u'Tim Sullivan', u'http://www.signonsandiego.com/rss/authors/tim-sullivan/'),
-        (u'Ruben Navarrette', u'http://www.signonsandiego.com/rss/authors/ruben-navarrette/'),
-        (u'Diane Bell', u'http://www.signonsandiego.com/rss/authors/diane-bell/'),
-        (u'Smart Living', u'http://www.signonsandiego.com/rss/headlines/smart-living/'),
-        (u'Photos', u'http://www.signonsandiego.com/rss/photos/'),
-        (u'Arts', u'http://www.signonsandiego.com/rss/headlines/night-and-day/theater-arts/'),
-        (u'Books', u'http://www.signonsandiego.com/rss/headlines/lifestyle/books/'),
-        (u'Currents-Passages',
-         u'http://www.signonsandiego.com/rss/headlines/lifestyle/currents/passages/'),
-        (u'Currents-Weekend',
-         u'http://www.signonsandiego.com/news/rss2/daily/currentsweekend.xml'),
-        (u'Dialog', u'http://www.signonsandiego.com/news/rss2/daily/dialog.xml'),
-        (u'Home', u'http://www.signonsandiego.com/rss/headlines/home/'),
-        (u'Homescape', u'http://www.signonsandiego.com/rss/headlines/lifestyle/homescape/'),
-        (u'Night & Day', u'http://www.signonsandiego.com/news/rss2/daily/nightday.xml'),
-        (u'Opinion', u'http://www.signonsandiego.com/rss/headlines/opinion/'),
-        (u'Quest', u'http://www.signonsandiego.com/news/rss2/daily/quest.xml'),
-        (u'Travel', u'http://www.signonsandiego.com/news/rss2/daily/travel.xml'),
-        (u'Wheels', u'http://www.signonsandiego.com/news/rss2/daily/wheels.xml')
+        (u'Latest News',
+         u'http://www.sandiegouniontribune.com/latest/rss2.0.xml'),
+        (u'Business',
+         u'http://www.sandiegouniontribune.com/business/rss2.0.xml'),
+        (u'Politics',
+         u'http://www.sandiegouniontribune.com/news/politics/rss2.0.xml'),
+        (u'Immigration',
+         u'http://www.sandiegouniontribune.com/news/immigration/rss2.0.xml'),
+        (u'Courts',
+         u'http://www.sandiegouniontribune.com/news/public-safety/rss2.0.xml'),
+        (u'Education',
+         u'http://www.sandiegouniontribune.com/news/education/rss2.0.xml'),
+        (u'Sports',
+         u'http://www.sandiegouniontribune.com/sports/rss2.0.xml'),
+        (u'Chargers',
+         u'http://www.sandiegouniontribune.com/sports/chargers/rss2.0.xml'),
+        (u'Padres',
+         u'http://www.sandiegouniontribune.com/sports/padres/rss2.0.xml'),
+        (u'NFL',
+         u'http://www.sandiegouniontribune.com/sports/nfl/rss2.0.xml'),
+        (u'NBA',
+         u'http://www.sandiegouniontribune.com/sports/nba/rss2.0.xml'),
+        (u'Photos',
+         u'http://www.sandiegouniontribune.com/visuals/rss2.0.xml'),
+        (u'Entertainment',
+         u'http://www.sandiegouniontribune.com/entertainment/rss2.0.xml'),
+        (u'Books',
+         u'http://www.sandiegouniontribune.com/entertainment/books/rss2.0.xml'),
+        (u'Opinion',
+         u'http://www.sandiegouniontribune.com/opinion/rss2.0.xml'),
+        (u'Travel',
+         u'http://www.sandiegouniontribune.com/lifestyle/travel/rss2.0.xml'),
     ]
@@ -28,5 +28,5 @@ class Starbulletin(BasicNewsRecipe):
         (u'Business', u'http://www.staradvertiser.com/business/feed/'),
         (u'Sports', u'http://www.staradvertiser.com/sports/feed/'),
         (u'Features',
-         u'http://www.staradvertiser.com/featurespremium/index.rss')
+         u'http://www.staradvertiser.com/features/feed/')
     ]
@@ -1,97 +0,0 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-import re
-
-
-class TelevisionWithoutPity(BasicNewsRecipe):
-    title = u'Television Without Pity'
-    language = 'en'
-    __author__ = 'Snarkastica'
-    # Used for pulling down an entire show, not just the RSS feed
-    SHOW = 'http://www.televisionwithoutpity.com/show/SHOW-NAME-HERE/recaps/'
-    oldest_article = 7  # days
-    max_articles_per_feed = 25
-    # reverse_article_order=True # Useful for entire show, to display in episode order
-    use_embedded_content = False
-
-    preprocess_regexps = [(re.compile(r'<span class="headline_recap_title .*?>',
-                                      re.DOTALL | re.IGNORECASE), lambda match: '<span class="headline_recap_title">')]
-    keep_only_tags = [dict(name='span', attrs={'class': 'headline_recap_title'}), dict(
-        name='p', attrs={'class': 'byline'}), dict(name='div', attrs={'class': 'body_recap'}), dict(name='h1')]
-    no_stylesheets = True
-
-    # Comment this out and configure process_index() to retrieve a single show
-    feeds = [
-        ('Ltest Recaps',
-         'http://www.televisionwithoutpity.com/rss.xml'),
-    ]
-
-    '''
-    This method can be used to grab all recaps for a single show
-    Set the SHOW constant at the beginning of this file to the URL for a show's recap page
-    (the page listing all recaps, usually of the form:
-    http://www.televisionwithoutpity.com/show/SHOW-NAME/recaps/"
-    Where SHOW-NAME is the hyphenated name of the show.
-
-    To use:
-    1. Comment out feeds = [...] earlier in this file
-    2. Set the SHOW constant to the show's recap page
-    3. Uncomment the following function
-    '''
-
-    '''
-    def parse_index(self):
-        soup = self.index_to_soup(self.SHOW)
-        feeds = []
-        articles = []
-        showTitle = soup.find('h1').string
-        recaps = soup.find('table')
-        for ep in recaps.findAll('tr'):
-            epData = ep.findAll('td')
-            epNum = epData[0].find(text=True).strip()
-            if not epNum == "Ep.":
-                epT = self.tag_to_string(epData[1].find('em')).strip()
-                epST = " (or " + self.tag_to_string(epData[1].find('h3')).strip() + ")"
-                epTitle = epNum + ": " + epT + epST
-                epData[1].find('em').extract()
-                epURL = epData[1].find('a', href=True)
-                epURL = epURL['href']
-                epSum = self.tag_to_string(epData[1].find('p')).strip()
-                epDate = epData[2].find(text=True).strip()
-                epAuthor = self.tag_to_string(epData[4].find('p')).strip()
-                articles.append({'title':epTitle, 'url':epURL, 'description':epSum, 'date':epDate, 'author':epAuthor})
-        feeds.append((showTitle, articles))
-        #self.abort_recipe_processing("test")
-        return feeds
-    '''
-
-    # This will add subsequent pages of multipage recaps to a single article
-    # page
-    def append_page(self, soup, appendtag, position):
-        # If false, will still grab single-page recaplets
-        if (soup.find('p', attrs={'class': 'pages'})):
-            pager = soup.find('p', attrs={'class': 'pages'}).find(text='Next')
-            if pager:
-                nexturl = pager.parent['href']
-                soup2 = self.index_to_soup(nexturl)
-                texttag = soup2.find('div', attrs={'class': 'body_recap'})
-                for it in texttag.findAll(style=True):
-                    del it['style']
-                newpos = len(texttag.contents)
-                self.append_page(soup2, texttag, newpos)
-                texttag.extract()
-                appendtag.insert(position, texttag)
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body, 3)
-        return soup
-
-    # Remove the multi page links (we had to keep these in for append_page(), but they can go away now
-    # Could have used CSS to hide, but some readers ignore CSS.
-    def postprocess_html(self, soup, first_fetch):
-        paginator = soup.findAll('p', attrs={'class': 'pages'})
-        if paginator:
-            for p in paginator:
-                p.extract()
-
-        # TODO: Fix this so it converts the headline class into a heading 1
-        return soup
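The deleted recipe above ends with a TODO about promoting the recap headline span to a proper heading. For what it is worth, a small bs4 sketch of that kind of transformation follows; the class name comes from the recipe above, while the helper name and sample HTML are illustrative only.

from bs4 import BeautifulSoup


def promote_headline(soup):
    # Turn each span carrying the recap headline class into an <h1> so e-book
    # readers render it as a heading rather than inline text.
    for span in soup.find_all('span', class_='headline_recap_title'):
        span.name = 'h1'
    return soup


if __name__ == '__main__':
    html = '<span class="headline_recap_title">Pilot</span><p>Recap body</p>'
    print(promote_headline(BeautifulSoup(html, 'html.parser')))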