From a1810043c9291ac44d367566df548e07d27ce924 Mon Sep 17 00:00:00 2001
From: Allan Simonsen
Date: Sat, 3 Dec 2016 06:52:55 +0100
Subject: [PATCH] Fixed recipes. Deleted recipes whose hosts no longer exist
 or no longer provide RSS feeds.

---
 recipes/edge_conversations.recipe      |  33 +--
 recipes/european_voice.recipe          |  53 ----
 recipes/kitsapun.recipe                |  32 +--
 recipes/mobilenations.recipe           |   2 +-
 recipes/nme.recipe                     | 381 +------------------------
 recipes/resurgence.recipe              |  22 --
 recipes/security_watch.recipe          |  28 +-
 recipes/sign_on_sd.recipe              |  65 +++--
 recipes/staradvertiser.recipe          |   2 +-
 recipes/television_without_pity.recipe |  97 -------
 10 files changed, 54 insertions(+), 661 deletions(-)
 delete mode 100644 recipes/european_voice.recipe
 delete mode 100644 recipes/resurgence.recipe
 delete mode 100644 recipes/television_without_pity.recipe

diff --git a/recipes/edge_conversations.recipe b/recipes/edge_conversations.recipe
index 3ad25c5e8a..afeaffe6bd 100644
--- a/recipes/edge_conversations.recipe
+++ b/recipes/edge_conversations.recipe
@@ -18,35 +18,6 @@ class EdgeConversationRSS(BasicNewsRecipe):
     oldest_article = 60
     max_articles_per_feed = 100
     no_stylesheets = True
+    auto_cleanup = True
-    keep_only_tags = [
-        dict(name='div', attrs={'class': 'HomeLeftPannel IMGCTRL'})]
-    remove_tags = [
-        dict(name='div', attrs={'class': 'Logo'})
-    ]
-
-    feeds = [(u'Edge RSS', u'http://edge.org/feeds/')]
-
-    def print_version(self, url):
-        return url.replace('conversation/', 'conversation.php?cid=')
-
-    def parse_feeds(self):
-
-        # Call parent's method.
-        feeds = BasicNewsRecipe.parse_feeds(self)
-
-        # Loop through all feeds.
-        for feed in feeds:
-
-            # Loop through all articles in feed.
-            for article in feed.articles[:]:
-
-                # Remove anything that is not a conversation, and remove PDF
-                # files as well...
-
-                if not ('CONVERSATION' in article.title):
-                    feed.articles.remove(article)
-                elif 'pdf' in article.url:
-                    feed.articles.remove(article)
-
-        return feeds
+    feeds = [(u'Edge RSS', u'http://edge.org/feed')]
diff --git a/recipes/european_voice.recipe b/recipes/european_voice.recipe
deleted file mode 100644
index 44ae0d2b54..0000000000
--- a/recipes/european_voice.recipe
+++ /dev/null
@@ -1,53 +0,0 @@
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class EuropeanVoice(BasicNewsRecipe):
-    title = u'European Voice'
-    __author__ = 'malfi'
-    oldest_article = 14
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    cover_url = 'http://www.europeanvoice.com/Css/images/logo.gif'
-    language = 'en'
-    keep_only_tags = [dict(name='div', attrs={'id': 'articleLeftColumn'})]
-    remove_tags = [dict(name='div', attrs={'id': 'BreadCrump'})]
-    feeds = [
-        (u'Whole site ', u'http://www.europeanvoice.com/Rss/2.xml'),
-        (u'News and analysis', u'http://www.europeanvoice.com/Rss/6.xml'),
-        (u'Comment', u'http://www.europeanvoice.com/Rss/7.xml'),
-        (u'Special reports', u'http://www.europeanvoice.com/Rss/5.xml'),
-        (u'People', u'http://www.europeanvoice.com/Rss/8.xml'),
-        (u'Career', u'http://www.europeanvoice.com/Rss/11.xml'),
-        (u'Policies', u'http://www.europeanvoice.com/Rss/4.xml'),
-        (u'EVents', u'http://www.europeanvoice.com/Rss/10.xml'),
-        (u'Policies - Economics', u'http://www.europeanvoice.com/Rss/31.xml'),
-        (u'Policies - Business', u'http://www.europeanvoice.com/Rss/19.xml'),
-        (u'Policies - Trade', u'http://www.europeanvoice.com/Rss/25.xml'),
-        (u'Policies - Information society',
-         u'http://www.europeanvoice.com/Rss/20.xml'),
-        (u'Policies - Energy', u'http://www.europeanvoice.com/Rss/15.xml'),
-        (u'Policies - Transport', u'http://www.europeanvoice.com/Rss/18.xml'),
-        (u'Policies - Climate change', u'http://www.europeanvoice.com/Rss/16.xml'),
-        (u'Policies - Environment', u'http://www.europeanvoice.com/Rss/17.xml'),
-        (u'Policies - Farming & food', u'http://www.europeanvoice.com/Rss/23.xml'),
-        (u'Policies - Health & society', u'http://www.europeanvoice.com/Rss/24.xml'),
-        (u'Policies - Justice', u'http://www.europeanvoice.com/Rss/29.xml'),
-        (u'Policies - Foreign affairs', u'http://www.europeanvoice.com/Rss/27.xml')
-    ]
-    extra_css = '''
-    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
-    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
-    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
-    '''
-
-    def print_version(self, url):
-        return url + '?bPrint=1'
-
-    def preprocess_html(self, soup):
-        denied = soup.findAll(True, text='Subscribers')
-        if denied:
-            raise Exception(
-                'Article skipped, because content can only be seen with subscription')
-        return soup
diff --git a/recipes/kitsapun.recipe b/recipes/kitsapun.recipe
index 60a4a93aa4..55b9e5c4cd 100644
--- a/recipes/kitsapun.recipe
+++ b/recipes/kitsapun.recipe
@@ -15,29 +15,17 @@ class Kitsapsun(BasicNewsRecipe):
     publisher = 'Scripps Interactive Newspapers Group'
     category = 'news, Kitsap county, USA'
     language = 'en'
-    oldest_article = 2
-    max_articles_per_feed = 100
+    oldest_article = 7
+    max_articles_per_feed = 50
     no_stylesheets = True
     encoding = 'cp1252'
     use_embedded_content = False
+    auto_cleanup = True
-    conversion_options = {
-        'comments': description, 'tags': category, 'language': language, 'publisher': publisher
-    }
-
-    keep_only_tags = [
-        dict(name='div', attrs={'id': ['story_meta',
-                                       'story_content']})]
-
-    remove_tags = [dict(name=['object', 'link', 'embed', 'form', 'iframe'])]
-
-    feeds = [
-
-        (u'News', u'http://www.kitsapsun.com/rss/headlines/news/'),
-        (u'Business', u'http://www.kitsapsun.com/rss/headlines/business/'),
-        (u'Communities', u'http://www.kitsapsun.com/rss/headlines/communities/'),
-        (u'Entertainment', u'http://www.kitsapsun.com/rss/headlines/entertainment/'),
-        (u'Lifestyles', u'http://www.kitsapsun.com/rss/headlines/lifestyles/')
-    ]
-
-    def print_version(self, url):
-        return url.rpartition('/')[0] + '/?print=1'
+    feeds = [(u'News', u'http://www.kitsapsun.com/feeds/rss/news'),
+             (u'Sports', u'http://www.kitsapsun.com/feeds/rss/sports'),
+             (u'Entertainment',
+              u'http://www.kitsapsun.com/feeds/rss/entertainment'),
+             (u'Lifestyles', u'http://www.kitsapsun.com/feeds/rss/lifestyle'),
+             (u'Opinion', u'http://www.kitsapsun.com/feeds/rss/opinion'),
+             ]
diff --git a/recipes/mobilenations.recipe b/recipes/mobilenations.recipe
index 8053fa43ff..d4a13af10f 100644
--- a/recipes/mobilenations.recipe
+++ b/recipes/mobilenations.recipe
@@ -14,5 +14,5 @@ class HindustanTimes(BasicNewsRecipe):

     feeds = [
         ('News',
-         'http://www.mobilenations.com/rss/mb.xml'),
+         'http://www.mobilenations.com/about?format=RSS'),
     ]
diff --git a/recipes/nme.recipe b/recipes/nme.recipe
index 0ee2c93257..8b2a331f59 100644
--- a/recipes/nme.recipe
+++ b/recipes/nme.recipe
@@ -31,383 +31,10 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     simultaneous_downloads = 20
     use_embedded_content = False
     recursions = 0
-
-    conversion_options = {
-        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
-    }
+    auto_cleanup = True

     feeds = [
-        (u'NME News', u'http://www.nme.com/rss/news'),
-        (u'Reviews', u'http://www.nme.com/rss/reviews'),
-        (u'Blogs', u'http://www.nme.com/rss/blogs'),
+        (u'NME News', u'http://www.nme.com/news/feed'),
+        (u'Reviews', u'http://www.nme.com/reviews/feed/'),
+        (u'Blogs', u'http://www.nme.com/blogs/feed'),
     ]
-
-    keep_only_tags = [
-        dict(name='div', attrs={'id': 'content'}),
-    ]
-
-    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
-                         'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
-
-    remove_tags = [
-        dict(name='meta'),
-        dict(name='span', attrs={'class': 'article_info'}),
-        dict(name='div', attrs={'class': 'breadcrumbs'}),
-        dict(name='div', attrs={'class': 'mugshot'}),
-        dict(name='div', attrs={'class': 'header'}),
-        dict(name='div', attrs={'class': re.compile(
-            'youtube.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile(
-            'socialbuttons.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': 'clear_both'}),
-        dict(name='div', attrs={'class': re.compile(
-            'headline.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': 'member-signedout'}),
-        dict(name='div', attrs={'class': re.compile(
-            'prev_next.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile(
-            'article_related.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile(
-            'feature_bar.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile('ebay.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile(
-            'morenews.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile(
-            'ticketspopup.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile(
-            'ratemy_logprompt.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile(
-            'related_artist.*', re.IGNORECASE)}),
-        dict(name='img', attrs={'class': re.compile(
-            'video_play_large.*', re.IGNORECASE)}),
-        dict(name='ul', attrs={'class': re.compile(
-            'prev_next.*', re.IGNORECASE)}),
-        dict(name='ul', attrs={'class': re.compile(
-            'nme_store.*', re.IGNORECASE)}),
-        dict(name='p', attrs={'class': re.compile('top', re.IGNORECASE)}),
-        dict(name='table', attrs={
-            'class': re.compile('tickets.*', re.IGNORECASE)}),
-    ]
-
-    masthead_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
-
-    def get_cover_url(self):
-        magazine_page_raw = self.index_to_soup(
-            'http://www.nme.com/magazine', raw=True)
-        magazine_page_raw = re.sub(
-            r'', '', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
-        magazine_page_raw = re.sub(
-            r'<!\[if ', '<!--[if ', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
-        magazine_page = self.index_to_soup(magazine_page_raw)
-        cov = magazine_page.find('img', attrs={'class': 'magcover'})
-
-        cov2 = str(cov['src'])
-
-        br = browser()
-        br.set_handle_redirect(False)
-        try:
-            br.open_novisit(cov2)
-            cover_url = str(cov2)
-        except:
-            cover_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
-        return cover_url
-
-    def preprocess_raw_html(self, raw_html, url):
-        '''
-        Need this for a bug on site that prevents blog posts being parsed correctly
-        '''
-        raw_html = re.sub(r'<!\[if ', '<!--[if ', raw_html,
-                          flags=re.DOTALL | re.IGNORECASE)
-
-        return raw_html
-
-    def preprocess_html(self, soup):
-        youtube_regex = re.compile(
-            r'(?:youtube\.com/(?:[^/]+/.+/|(?:v|e(?:mbed)?)/|.*[?&]v=)|youtu\.be/)(?P<id>[^"&?/ ]{11})', re.DOTALL | re.IGNORECASE)
-        instagram_regex = re.compile(
-            r'.*?instagram.*?', re.DOTALL | re.IGNORECASE)
-        twitter_regex = re.compile(r'.*?twitter.*?', re.DOTALL | re.IGNORECASE)
-        visualise_regex = re.compile(
-            r'.*?visualise.*?', re.DOTALL | re.IGNORECASE)
-        soundcloud_regex = re.compile(
-            r'(?P<url>.*?(w|api)\.soundcloud.*?com\/(tracks|playlists)\/\d{8,9})', re.DOTALL | re.IGNORECASE)
-        dailymotion_regex = re.compile(
-            r'.*?dailymotion.*?', re.DOTALL | re.IGNORECASE)
-        spotify_regex = re.compile(r'.*?spotify.*?', re.DOTALL | re.IGNORECASE)
-        vine_regex = re.compile(r'.*?vine.*?', re.DOTALL | re.IGNORECASE)
-        doubleHtmlEntities = re.compile(
-            ur'(&)(?P<entity>[\d\w\#]*;)', re.DOTALL | re.IGNORECASE | re.UNICODE)
-        for iframe in soup.findAll('iframe'):
-            if iframe.has_key('src') and youtube_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ YouTube ]')
-                pq.insert(1, br)
-                m = youtube_regex.search(iframe['src'])
-                if m.group('id') is not None:
-                    imgTag = Tag(soup, 'img', [
-                        ('src', 'http://img.youtube.com/vi/' + m.group('id') + '/0.jpg')])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and soundcloud_regex.search(iframe['src']) is not None:  # noqa
-                m = soundcloud_regex.search(iframe['src'])
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ SoundCloud ]')
-                pq.insert(1, br)
-                pq.insert(2, m.group('url'))
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and dailymotion_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ DailyMotion ]')
-                pq.insert(1, br)
-                imgUrl = self.get_dailymotion_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and spotify_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Spotify ]')
-                pq.insert(1, br)
-                imgUrl = self.get_spotify_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and vine_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Vine ]')
-                pq.insert(1, br)
-                imgUrl = self.get_vine_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and visualise_regex.search(iframe['src']) is not None:  # noqa
-                imgUrl = self.get_visualise_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    iframe.replaceWith(imgTag)
-        for blockquote in soup.findAll('blockquote'):
-            if blockquote.has_key('class') and twitter_regex.search(blockquote['class']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Twitter ]')
-                pq.insert(len(pq.contents), br)
-                match = re.search(
-                    "(?P<url>pic\.twitter[^\s<]+)", str(blockquote))
-                if match is not None:
-                    img = self.get_twitter_pic(str(match.group("url")))
-                    if img is not None:
-                        pq.insert(len(pq.contents), img)
-                for p in blockquote.findAll(name='p'):
-                    x = 0
-                    plen = len(p.contents)
-                    while True:
-                        c = len(pq.contents)
-                        if p.contents[x].string is not None:
-                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(
-                                2), str(p.contents[x].string), re.IGNORECASE | re.UNICODE))
-                        else:
-                            pq.insert(c, p.contents[x].content)
-                        x += 1
-                        if x == plen:
-                            break
-                    br = Tag(soup, 'br')
-                    pq.insert(len(pq.contents), br)
-                    p.extract()
-                if len(blockquote.contents) > 0:
-                    x = 0
-                    xlen = len(blockquote.contents)
-                    while True:
-                        c = len(pq.contents)
-                        if blockquote.contents[x].string is not None:
-                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(
-                                2), str(blockquote.contents[x].string), re.IGNORECASE | re.UNICODE))
-                        else:
-                            pq.insert(c, blockquote.contents[x].content)
-                        x += 1
-                        if x == xlen:
-                            break
-                blockquote.replaceWith(pq)
-            elif blockquote.has_key('class') and instagram_regex.search(blockquote['class']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Instagram ]')
-                pq.insert(1, br)
-                a = blockquote.find(name='a', attrs={'href': instagram_regex})
-                imgUrl = None
-                if a is not None:
-                    imgUrl = self.get_instagram_pic(str(a['href']))
-                    if imgUrl is not None:
-                        img = Tag(soup, 'img', [('src', imgUrl)])
-                        pq.insert(len(pq.contents), img)
-                for p in blockquote.findAll(name='p'):
-                    x = 0
-                    plen = len(p.contents)
-                    while x < plen:
-                        c = len(pq.contents)
-                        if p.contents[x].string is not None:
-                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(
-                                2), str(p.contents[x].string), re.IGNORECASE | re.UNICODE))
-                        # else:
-                        #     pq.insert(c, p.contents[x].content)
-                        x += 1
-                    br = Tag(soup, 'br')
-                    c = len(pq.contents)
-                    pq.insert(c, br)
-                blockquote.replaceWith(pq)
-        for alink in soup.findAll('a'):
-            if alink.string is not None:
-                tstr = alink.string
-                alink.replaceWith(tstr)
-            elif alink.img is not None:
-                tstr = alink.img
-                alink.replaceWith(tstr)
-            elif alink.span is not None:
-                tstr = alink.span
-                alink.replaceWith(tstr)
-        return soup
-
-    def get_visualise_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        bs = BeautifulSoup(raw)
-        imgRaw = bs.find(name='meta', attrs={'property': 'og:image'})
-        if imgRaw is not None:
-            returnValue = str(imgRaw['content'])
-        return returnValue
-
-    def get_twitter_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open('https://' + url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        bs = BeautifulSoup(raw)
-        refresh = bs.find('meta', {'http-equiv': 'refresh'})
-        if refresh is not None:
-            content = refresh.get('content').partition('=')[2]
-            try:
-                raw = self.browser.open(content).read()
-            except:
-                print '404: ' + url
-                return returnValue
-            bs = BeautifulSoup(raw)
-            img = bs.find(name='img', attrs={
-                'alt': re.compile('.*permalink.*', re.IGNORECASE)})
-            if img is not None:
-                returnValue = img
-        return returnValue
-
-    def get_soundcloud_pic(self, url):
-        # content loaded via javascript and require an login and/or registered application identification
-        # returnValue = None
-        # raw = self.browser.open(soundcloudUrl + '&visual=true').read()
-        # bs = BeautifulSoup(raw)
-        # imgRaw = bs.find(name='div', attrs={'style':re.compile(r'backgroud-image:*?',re.IGNORECASE)})
-        # if imgRaw is not None:
-        #     returnValue = str(imgRaw['style'])
-        return None  # returnValue
-
-    def get_instagram_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('\"display_src\":\"(?P<url>http[s]?:.*?)\"', str(raw))
-        if m is not None:
-            returnValue = re.sub(r'\\', '', m.group(
-                "url"), flags=re.DOTALL | re.IGNORECASE)
-        return returnValue
-
-    def get_dailymotion_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('("thumbnail_url\"\:\")(?P<url>http.*?)(\")', str(raw))
-        if m is not None:
-            returnValue = re.sub(r'\\', '', m.group(
-                "url"), flags=re.DOTALL | re.IGNORECASE)
-        return returnValue
-
-    def get_spotify_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('data-ca=\"(?P<url>.*?)\"', str(raw))
-        if m is not None:
-            returnValue = m.group("url")
-        return returnValue
-
-    def get_vine_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('"thumbnail.*?src=\"(?P<url>.*?)\"', str(raw))
-        if m is not None:
-            returnValue = m.group("url")
-        return returnValue
-
-    preprocess_regexps = [
-        (re.compile(r'', re.DOTALL | re.IGNORECASE), lambda h1: ''),
-        (re.compile(r'', re.IGNORECASE), lambda h2: ''),
-        (re.compile(r'p:not(.date){
-    font-weight:bold;
-    }
-    '''
diff --git a/recipes/resurgence.recipe b/recipes/resurgence.recipe
deleted file mode 100644
index 87ab2bfd96..0000000000
--- a/recipes/resurgence.recipe
+++ /dev/null
@@ -1,22 +0,0 @@
-__license__ = 'GPL v3'
-__copyright__ = '2012, Peter Grungi'
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class TheResurgence(BasicNewsRecipe):
-    title = u'The Resurgence'
-    __author__ = 'Peter Grungi'
-    language = 'en'
-
-    oldest_article = 7
-    max_articles_per_feed = 10
-    auto_cleanup = True
-    cover_url = 'http://cdn.theresurgence.com/images/logo.png'
-    masthead_url = 'http://cdn.theresurgence.com/images/logo.png'
-    language = 'en'
-    publisher = 'The Resurgence'
-    author = 'The Resurgence'
-
-    feeds = [
-        (u'The Resurgence', u'http://feeds.theresurgence.com/TheResurgence?format=xml')]
diff --git a/recipes/security_watch.recipe b/recipes/security_watch.recipe
index 4780f549c4..13c17f10b1 100644
--- a/recipes/security_watch.recipe
+++ b/recipes/security_watch.recipe
@@ -10,30 +10,10 @@ class SecurityWatch(BasicNewsRecipe):
    oldest_article = 14
     max_articles_per_feed = 100
     use_embedded_content = False
-    filter_regexps = [r'feedads\.googleadservices\.com']
-    filter_regexps = [r'ad\.doubleclick']
-    filter_regexps = [r'advert']
     language = 'en'
-
-    extra_css = 'div {text-align:left}'
-
-    remove_tags = [dict(id='topBannerContainer'),
-                   dict(id='topBannerSmall'),
-                   dict(id='topSearchBar'),
-                   dict(id='topSearchForm'),
-                   dict(id='rtBannerMPU'),
-                   dict(id='topNavBar'),
-                   dict(id='breadcrumbs'),
-                   # dict(id='entry-28272'),
-                   dict(id='topSearchLinks'),
-                   dict(name='span', attrs={'class': 'date'})]
-
-    remove_tags_after = [dict(id='googlemp')]
+    auto_cleanup = True

     feeds = [
-        (u'securitywatch', u'http://feeds.ziffdavisenterprise.com/RSS/security_watch/')]
-
-    def postprocess_html(self, soup, first_fetch):
-        for t in soup.findAll(['table', 'tr', 'td']):
-            t.name = 'div'
-        return soup
+        (u'securitywatch',
+         u'http://feeds.pcmag.com/Rss.aspx/SectionArticles?sectionId=28026')
+    ]
diff --git a/recipes/sign_on_sd.recipe b/recipes/sign_on_sd.recipe
index df942142b2..aefe92b71b 100644
--- a/recipes/sign_on_sd.recipe
+++ b/recipes/sign_on_sd.recipe
@@ -17,39 +17,38 @@ class AdvancedUserRecipe1315899507(BasicNewsRecipe):
     auto_cleanup = True
     remove_empty_feeds = True
     publication_type = 'newspaper'
-    masthead_url = 'http://media.signonsandiego.com/e2/sosd/images/sosd_logo.png'

     feeds = [
-        (u'Latest News', u'http://www.signonsandiego.com/rss/headlines/'),
-        (u'Local News', u'http://www.signonsandiego.com/rss/headlines/metro/'),
-        (u'Business', u'http://www.signonsandiego.com/rss/headlines/business/'),
-        (u'Politics', u'http://www.signonsandiego.com/rss/headlines/local/politics/'),
-        (u'Border & Immigration', u'http://www.signonsandiego.com/rss/headlines/border/'),
-        (u'Courts', u'http://www.signonsandiego.com/rss/headlines/courts/'),
-        (u'Education', u'http://www.signonsandiego.com/news/education/'),
-        (u'Sports', u'http://www.signonsandiego.com/rss/headlines/sports/'),
-        (u'Chargers', u'http://www.signonsandiego.com/rss/headlines/sports/chargers/'),
-        (u'Padres', u'http://www.signonsandiego.com/rss/headlines/sports/padres/'),
-        (u'NFL', u'http://www.signonsandiego.com/rss/headlines/sports/nfl/'),
-        (u'NBA', u'http://www.signonsandiego.com/rss/headlines/sports/nba/'),
-        (u'Nick Canepa', u'http://www.signonsandiego.com/rss/authors/nick-canepa/'),
-        (u'Tim Sullivan', u'http://www.signonsandiego.com/rss/authors/tim-sullivan/'),
-        (u'Ruben Navarrette', u'http://www.signonsandiego.com/rss/authors/ruben-navarrette/'),
-        (u'Diane Bell', u'http://www.signonsandiego.com/rss/authors/diane-bell/'),
-        (u'Smart Living', u'http://www.signonsandiego.com/rss/headlines/smart-living/'),
-        (u'Photos', u'http://www.signonsandiego.com/rss/photos/'),
-        (u'Arts',
-         u'http://www.signonsandiego.com/rss/headlines/night-and-day/theater-arts/'),
-        (u'Books', u'http://www.signonsandiego.com/rss/headlines/lifestyle/books/'),
-        (u'Currents-Passages',
-         u'http://www.signonsandiego.com/rss/headlines/lifestyle/currents/passages/'),
-        (u'Currents-Weekend',
-         u'http://www.signonsandiego.com/news/rss2/daily/currentsweekend.xml'),
-        (u'Dialog', u'http://www.signonsandiego.com/news/rss2/daily/dialog.xml'),
-        (u'Home', u'http://www.signonsandiego.com/rss/headlines/home/'),
-        (u'Homescape', u'http://www.signonsandiego.com/rss/headlines/lifestyle/homescape/'),
-        (u'Night & Day', u'http://www.signonsandiego.com/news/rss2/daily/nightday.xml'),
-        (u'Opinion', u'http://www.signonsandiego.com/rss/headlines/opinion/'),
-        (u'Quest', u'http://www.signonsandiego.com/news/rss2/daily/quest.xml'),
-        (u'Travel', u'http://www.signonsandiego.com/news/rss2/daily/travel.xml'),
-        (u'Wheels', u'http://www.signonsandiego.com/news/rss2/daily/wheels.xml')
+        (u'Latest News',
+         u'http://www.sandiegouniontribune.com/latest/rss2.0.xml'),
+        (u'Business',
+         u'http://www.sandiegouniontribune.com/business/rss2.0.xml'),
+        (u'Politics',
+         u'http://www.sandiegouniontribune.com/news/politics/rss2.0.xml'),
+        (u'Immigration',
+         u'http://www.sandiegouniontribune.com/news/immigration/rss2.0.xml'),
+        (u'Courts',
+         u'http://www.sandiegouniontribune.com/news/public-safety/rss2.0.xml'),
+        (u'Education',
+         u'http://www.sandiegouniontribune.com/news/education/rss2.0.xml'),
+        (u'Sports',
+         u'http://www.sandiegouniontribune.com/sports/rss2.0.xml'),
+        (u'Chargers',
+         u'http://www.sandiegouniontribune.com/sports/chargers/rss2.0.xml'),
+        (u'Padres',
+         u'http://www.sandiegouniontribune.com/sports/padres/rss2.0.xml'),
+        (u'NFL',
+         u'http://www.sandiegouniontribune.com/sports/nfl/rss2.0.xml'),
+        (u'NBA',
+         u'http://www.sandiegouniontribune.com/sports/nba/rss2.0.xml'),
+        (u'Photos',
+         u'http://www.sandiegouniontribune.com/visuals/rss2.0.xml'),
+        (u'Entertainment',
+         u'http://www.sandiegouniontribune.com/entertainment/rss2.0.xml'),
+        (u'Books',
+         u'http://www.sandiegouniontribune.com/entertainment/books/rss2.0.xml'),
+        (u'Opinion',
+         u'http://www.sandiegouniontribune.com/opinion/rss2.0.xml'),
+        (u'Travel',
+         u'http://www.sandiegouniontribune.com/lifestyle/travel/rss2.0.xml'),
     ]
diff --git a/recipes/staradvertiser.recipe b/recipes/staradvertiser.recipe
index 91e285d8e6..936b247448 100644
--- a/recipes/staradvertiser.recipe
+++ b/recipes/staradvertiser.recipe
@@ -28,5 +28,5 @@ class Starbulletin(BasicNewsRecipe):
         (u'Business', u'http://www.staradvertiser.com/business/feed/'),
         (u'Sports', u'http://www.staradvertiser.com/sports/feed/'),
         (u'Features',
-         u'http://www.staradvertiser.com/featurespremium/index.rss')
+         u'http://www.staradvertiser.com/features/feed/')
     ]
diff --git a/recipes/television_without_pity.recipe b/recipes/television_without_pity.recipe
deleted file mode 100644
index 66c96aa77a..0000000000
--- a/recipes/television_without_pity.recipe
+++ /dev/null
@@ -1,97 +0,0 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-import re
-
-
-class TelevisionWithoutPity(BasicNewsRecipe):
-    title = u'Television Without Pity'
-    language = 'en'
-    __author__ = 'Snarkastica'
-    # Used for pulling down an entire show, not just the RSS feed
-    SHOW = 'http://www.televisionwithoutpity.com/show/SHOW-NAME-HERE/recaps/'
-    oldest_article = 7  # days
-    max_articles_per_feed = 25
-    # reverse_article_order=True # Useful for entire show, to display in episode order
-    use_embedded_content = False
-
-    preprocess_regexps = [(re.compile(r'')]
-    keep_only_tags = [dict(name='span', attrs={'class': 'headline_recap_title'}), dict(
-        name='p', attrs={'class': 'byline'}), dict(name='div', attrs={'class': 'body_recap'}), dict(name='h1')]
-    no_stylesheets = True
-
-    # Comment this out and configure process_index() to retrieve a single show
-    feeds = [
-        ('Ltest Recaps',
-         'http://www.televisionwithoutpity.com/rss.xml'),
-    ]
-
-    '''
-    This method can be used to grab all recaps for a single show
-    Set the SHOW constant at the beginning of this file to the URL for a show's recap page
-    (the page listing all recaps, usually of the form:
-    http://www.televisionwithoutpity.com/show/SHOW-NAME/recaps/"
-    Where SHOW-NAME is the hyphenated name of the show.
-
-    To use:
-    1. Comment out feeds = [...] earlier in this file
-    2. Set the SHOW constant to the show's recap page
-    3. Uncomment the following function
-    '''
-
-    '''
-    def parse_index(self):
-        soup = self.index_to_soup(self.SHOW)
-        feeds = []
-        articles = []
-        showTitle = soup.find('h1').string
-        recaps = soup.find('table')
-        for ep in recaps.findAll('tr'):
-            epData = ep.findAll('td')
-            epNum = epData[0].find(text=True).strip()
-            if not epNum == "Ep.":
-                epT = self.tag_to_string(epData[1].find('em')).strip()
-                epST = " (or " + self.tag_to_string(epData[1].find('h3')).strip() + ")"
-                epTitle = epNum + ": " + epT + epST
-                epData[1].find('em').extract()
-                epURL = epData[1].find('a', href=True)
-                epURL = epURL['href']
-                epSum = self.tag_to_string(epData[1].find('p')).strip()
-                epDate = epData[2].find(text=True).strip()
-                epAuthor = self.tag_to_string(epData[4].find('p')).strip()
-                articles.append({'title':epTitle, 'url':epURL, 'description':epSum, 'date':epDate, 'author':epAuthor})
-        feeds.append((showTitle, articles))
-        #self.abort_recipe_processing("test")
-        return feeds
-    '''
-
-    # This will add subsequent pages of multipage recaps to a single article
-    # page
-    def append_page(self, soup, appendtag, position):
-        # If false, will still grab single-page recaplets
-        if (soup.find('p', attrs={'class': 'pages'})):
-            pager = soup.find('p', attrs={'class': 'pages'}).find(text='Next')
-            if pager:
-                nexturl = pager.parent['href']
-                soup2 = self.index_to_soup(nexturl)
-                texttag = soup2.find('div', attrs={'class': 'body_recap'})
-                for it in texttag.findAll(style=True):
-                    del it['style']
-                newpos = len(texttag.contents)
-                self.append_page(soup2, texttag, newpos)
-                texttag.extract()
-                appendtag.insert(position, texttag)
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body, 3)
-        return soup
-
-    # Remove the multi page links (we had to keep these in for append_page(), but they can go away now
-    # Could have used CSS to hide, but some readers ignore CSS.
-    def postprocess_html(self, soup, first_fetch):
-        paginator = soup.findAll('p', attrs={'class': 'pages'})
-        if paginator:
-            for p in paginator:
-                p.extract()
-
-        # TODO: Fix this so it converts the headline class into a heading 1
-        return soup
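
The recipe fixes above converge on one pattern: drop the hand-maintained keep_only_tags/remove_tags scraping rules and let calibre's auto_cleanup extract the article body heuristically. A minimal sketch of that pattern follows; the class name, title, and feed URL are illustrative placeholders, not part of this patch:

    from calibre.web.feeds.news import BasicNewsRecipe


    class ExampleAutoCleanupRecipe(BasicNewsRecipe):
        # Hypothetical recipe, for illustration only.
        title = u'Example Feed'
        language = 'en'
        oldest_article = 7          # days of history to fetch
        max_articles_per_feed = 50
        no_stylesheets = True
        use_embedded_content = False
        # auto_cleanup hands article extraction to calibre's readability-style
        # heuristics, so there are no site-specific tag lists to keep in sync
        # with the publisher's markup.
        auto_cleanup = True

        feeds = [(u'Example', u'http://example.com/feed')]  # placeholder URL

The trade-off is less precise extraction on unusual layouts, but the recipe stops breaking every time the site changes its templates, which is what killed most of the tag lists deleted above.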
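The remaining hunks only swap dead feed URLs for live ones. One way to re-check such replacements before committing is sketched below using the third-party feedparser package; this is an assumption for illustration, not part of the calibre recipe API, and the URL list is a sample drawn from the hunks above:

    # Quick sanity check for replacement feed URLs (requires: pip install feedparser).
    import feedparser

    FEEDS = [
        'http://edge.org/feed',
        'http://www.nme.com/news/feed',
        'http://www.staradvertiser.com/features/feed/',
        'http://www.sandiegouniontribune.com/latest/rss2.0.xml',
    ]

    for url in FEEDS:
        d = feedparser.parse(url)
        # bozo is set when the feed failed to download or parse cleanly;
        # an empty entries list usually means the URL is dead or redirected.
        ok = (not d.bozo) and len(d.entries) > 0
        print('%s %s (%d entries)' % ('ok  ' if ok else 'DEAD', url, len(d.entries)))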