diff --git a/recipes/moscow_times.recipe b/recipes/moscow_times.recipe index 0b672f048e..dff09399d9 100644 --- a/recipes/moscow_times.recipe +++ b/recipes/moscow_times.recipe @@ -19,54 +19,14 @@ class Moscowtimes(BasicNewsRecipe): no_stylesheets = True use_embedded_content = False remove_empty_feeds = True - encoding = 'cp1251' - masthead_url = 'http://www.themoscowtimes.com/bitrix/templates/tmt/img/logo.gif' publication_type = 'newspaper' + auto_cleanup = True - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language - } - - extra_css = ''' - h1{ color:#0066B3; font-family: Georgia,serif ; font-size: large} - .article_date{ font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; color:#000000; font-size: x-small;} - .autors{color:#999999 ; font-weight: bold ; font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size: x-small; } - .photoautors{ color:#999999 ; font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size: x-small; } - .text{font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size:75%; } - ''' feeds = [ - - (u'Top Stories', u'http://www.themoscowtimes.com/rss/top'), - (u'Current Issue', u'http://www.themoscowtimes.com/rss/issue'), - (u'News', u'http://www.themoscowtimes.com/rss/news'), - (u'Business', u'http://www.themoscowtimes.com/rss/business'), - (u'Art and Ideas', u'http://www.themoscowtimes.com/rss/art'), - (u'Opinion', u'http://www.themoscowtimes.com/rss/opinion') + (u'Top Stories', u'https://themoscowtimes.com/feeds/main.xml'), + (u'Moscow', u'https://themoscowtimes.com/feeds/moscow.xml'), + (u'Russia', u'https://themoscowtimes.com/feeds/russia.xml'), + (u'World', u'https://themoscowtimes.com/feeds/world.xml'), + (u'Business', u'https://themoscowtimes.com/feeds/business.xml'), + (u'Opinion', u'https://themoscowtimes.com/feeds/opinion.xml') ] - - keep_only_tags = [dict(name='div', attrs={'id': 'content'})] - remove_tags = [ - dict(name='div', attrs={'class': ['photo_nav', 'phototext']}), dict( - name=['iframe', 'meta', 'base', 'link', 'embed', 'object']) - ] - - def preprocess_html(self, soup): - for lnk in soup.findAll('a'): - if lnk.string is not None: - ind = self.tag_to_string(lnk) - lnk.replaceWith(ind) - return soup - - def print_version(self, url): - return url.replace('.themoscowtimes.com/', '.themoscowtimes.com/print/') - - def get_cover_url(self): - cover_url = None - href = 'http://www.themoscowtimes.com/pdf/' - soup = self.index_to_soup(href) - div = soup.find('div', attrs={'class': 'left'}) - if div: - a = div.find('a') - if a: - cover_url = 'http://www.themoscowtimes.com' + a.img['src'] - return cover_url diff --git a/recipes/newsstraitstimes.recipe b/recipes/newsstraitstimes.recipe index 46558fb392..3b9988974d 100644 --- a/recipes/newsstraitstimes.recipe +++ b/recipes/newsstraitstimes.recipe @@ -1,4 +1,3 @@ - __license__ = 'GPL v3' __copyright__ = '2010, Darko Miletic ' ''' @@ -11,7 +10,8 @@ from calibre.web.feeds.news import BasicNewsRecipe class Newstraitstimes(BasicNewsRecipe): title = 'New Straits Times from Malaysia' __author__ = 'Darko Miletic' - description = 'Learning Curve, Sunday People, New Straits Times from Malaysia' + description = ('Learning Curve, Sunday People, ' + 'New Straits Times from Malaysia') publisher = 'nst.com.my' category = 'news, politics, Malaysia' oldest_article = 2 @@ -20,13 +20,6 @@ class Newstraitstimes(BasicNewsRecipe): encoding = 'cp1252' use_embedded_content = False language = 'en' - masthead_url = 'http://www.nst.com.my/Current_News/NST/Images/new-nstonline.jpg' + auto_cleanup = True - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language - } - - remove_tags = [dict(name=['link', 'table'])] - keep_only_tags = dict(name='div', attrs={'id': 'haidah'}) - - feeds = [(u'Articles', u'http://www.nst.com.my/rss/allSec')] + feeds = [(u'Articles', u'http://www.nst.com.my/latest.xml')] diff --git a/recipes/oldnewthing.recipe b/recipes/oldnewthing.recipe index 9dcd84e765..f1f6942174 100644 --- a/recipes/oldnewthing.recipe +++ b/recipes/oldnewthing.recipe @@ -19,14 +19,6 @@ class OldNewThing(BasicNewsRecipe): no_stylesheets = True use_embedded_content = False publication_type = 'blog' - extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif} .code{font-family: "Lucida Console",monospace} ' + auto_cleanup = True - conversion_options = { - 'comment': description, 'tags': 'blog, windows, microsoft, programming', 'publisher': 'Raymond Chen', 'language': language - } - - remove_attributes = ['width', 'height'] - keep_only_tags = [dict(attrs={'class': 'full-post'})] - remove_tags = [ - dict(attrs={'class': ['post-attributes', 'post-tags', 'post-actions']})] - feeds = [(u'Posts', u'http://blogs.msdn.com/oldnewthing/rss.xml')] + feeds = [(u'Posts', u'https://blogs.msdn.microsoft.com/oldnewthing/feed')] diff --git a/recipes/pc_advisor.recipe b/recipes/pc_advisor.recipe index 7067d93252..295a4d2ce2 100644 --- a/recipes/pc_advisor.recipe +++ b/recipes/pc_advisor.recipe @@ -32,56 +32,16 @@ class pcAdvisor(BasicNewsRecipe): remove_javascript = True no_stylesheets = True - - keep_only_tags = [ - dict(name='div', attrs={'id': 'articlecontent'}) - ] - - remove_tags = [ - dict(name='div', attrs={'id': ['crosssitesignup', 'submitarticle', 'dontPrint', - 'commentsForm', 'userReviewFormContainer', 'reevooContainerId']}), - dict(name='div', attrs={'class': 'mpu'}), - dict(name='p', attrs={'id': 'articlePageList'}), - dict(name='div', attrs={ - 'style': ['margin: 0pt 10px 5px;', 'margin: 0pt 10px 5px;']}), - dict(name='p', attrs={'class': 'dontPrint'}), - dict(name='h2', attrs={'class': 'sectionTitle'}), - dict(name='a', attrs={'title': 'Subscribe to PC Advisor'}), - dict(name='a', attrs={'name': 'revooContent'}), - {'name': ['form', 'script', 'link']} - ] - - remove_tags_after = [ - dict(name='p', attrs={'id': 'crosssitesignup'}) - ] - - def get_article_url(self, article): - return article.get('guid', None) + auto_cleanup = True feeds = [ - (u'News Headlines', u'http://www.pcadvisor.co.uk/rss/feeds/pcanews.xml'), - (u'Reviews', u'http://www.pcadvisor.co.uk/rss/feeds/pcareviews.xml'), - (u'New Products', - u'http://www.pcadvisor.co.uk/rss/feeds/blog18.xml'), - (u'PC Advisor Blog', - u'http://www.pcadvisor.co.uk/rss/feeds/blog4.xml'), - (u'PC Security', - u'http://www.pcadvisor.co.uk/rss/feeds/pca-security.xml'), - (u'Laptops', u'http://www.pcadvisor.co.uk/rss/feeds/pca-laptop.xml'), - (u'Green Computing', - u'http://www.pcadvisor.co.uk/rss/feeds/pca-green-computing.xml'), - (u'Internet and broadband', - u'http://www.pcadvisor.co.uk/rss/feeds/pca-internet.xml'), - (u'Prones and PDAs', - u'http://www.pcadvisor.co.uk/rss/feeds/pca-phones.xml'), - (u'Software', u'http://www.pcadvisor.co.uk/rss/feeds/pca-software.xml'), - (u'Small Business', - u'http://www.pcadvisor.co.uk/rss/feeds/pca-small-business.xml'), - (u'Photo and video', - u'http://www.pcadvisor.co.uk/rss/feeds/pca-photo-video.xml'), - (u'Mac News', u'http://www.pcadvisor.co.uk/rss/feeds/pca-mac.xml'), - (u'Linux', u'http://www.pcadvisor.co.uk/rss/feeds/pca-linux.xml'), - (u'WiFi and Networking', - u'http://www.pcadvisor.co.uk/rss/feeds/pca-networking.xml'), - (u'Gadgets', u'http://www.pcadvisor.co.uk/rss/feeds/pca-gadgets.xml') + (u'Latest', u'http://www.pcadvisor.co.uk/latest/rss'), + (u'News', u'http://www.pcadvisor.co.uk/news/rss'), + (u'How-tos', u'http://www.pcadvisor.co.uk/how-to/rss'), + (u'Reviews', u'http://www.pcadvisor.co.uk/review/rss'), + (u'Video Content', u'http://www.pcadvisor.co.uk/video/rss'), + (u'iPhone', u'http://www.pcadvisor.co.uk/latest/iphone/rss'), + (u'iPad', u'http://www.pcadvisor.co.uk/latest/ipad/rss'), + (u'Mac', u'http://www.pcadvisor.co.uk/latest/mac/rss'), + (u'Apple', u'http://www.pcadvisor.co.uk/latest/apple/rss'), ] diff --git a/recipes/phys_org.recipe b/recipes/phys_org.recipe index c8a8769a81..c175019b0c 100644 --- a/recipes/phys_org.recipe +++ b/recipes/phys_org.recipe @@ -19,8 +19,6 @@ class HindustanTimes(BasicNewsRecipe): 'http://phys.org/rss-feed/physics-news/'), ('Space and Earth', 'http://phys.org/rss-feed/space-news/'), - ('Electronics', - 'http://phys.org/rss-feed/electronics-news/'), ('Chemistry', 'http://phys.org/rss-feed/chemistry-news/'), ('Biology', diff --git a/recipes/politiken_dk.recipe b/recipes/politiken_dk.recipe index 9e08fd8d92..eb4ccd9156 100644 --- a/recipes/politiken_dk.recipe +++ b/recipes/politiken_dk.recipe @@ -22,26 +22,28 @@ class Politiken_dk(BasicNewsRecipe): encoding = 'cp1252' language = 'da' - extra_css = ' body{font-family: Arial,Helvetica,sans-serif } h1{font-family: Georgia,"Times New Roman",Times,serif } ' + extra_css = (' body{font-family: Arial,Helvetica,sans-serif } ' + 'h1{font-family: Georgia,"Times New Roman",Times,serif } ') conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + 'comment': description, + 'tags': category, + 'publisher': publisher, + 'language': language } feeds = [ - - (u'Tophistorier', u'http://politiken.dk/rss/tophistorier.rss'), - (u'Seneste nyt', u'http://politiken.dk/rss/senestenyt.rss'), - (u'Mest laeste', u'http://politiken.dk/rss/mestlaeste.rss'), - (u'Danmark', u'http://politiken.dk/rss/indland.rss'), - (u'Politik', u'http://politiken.dk/rss/politik.rss'), - (u'Klima', u'http://politiken.dk/rss/klima.rss'), - (u'Internationalt', u'http://politiken.dk/rss/udland.rss'), - (u'Erhverv', u'http://politiken.dk/rss/erhverv.rss'), - (u'Kultur', u'http://politiken.dk/rss/kultur.rss'), - (u'Sport', u'http://politiken.dk/rss/sport.rss'), - (u'Uddannelse', u'http://politiken.dk/rss/uddannelse.rss'), - (u'Videnskab', u'http://politiken.dk/rss/videnskab.rss') + (u'Tophistorier', u'http://politiken.dk/rss/tophistorier.rss'), + (u'Seneste nyt', u'http://politiken.dk/rss/senestenyt.rss'), + (u'Mest laeste', u'http://politiken.dk/rss/mestlaeste.rss'), + (u'Danmark', u'http://politiken.dk/rss/indland.rss'), + (u'Politik', u'http://politiken.dk/rss/politik.rss'), + (u'Klima', u'http://politiken.dk/rss/klima.rss'), + (u'Internationalt', u'http://politiken.dk/rss/udland.rss'), + (u'Erhverv', u'http://politiken.dk/rss/erhverv.rss'), + (u'Kultur', u'http://politiken.dk/rss/kultur.rss'), + (u'Sport', u'http://politiken.dk/rss/sport.rss'), + (u'Uddannelse', u'http://politiken.dk/rss/uddannelse.rss'), ] remove_tags_before = dict(name='h1') remove_tags = [ diff --git a/recipes/rollingstone.recipe b/recipes/rollingstone.recipe index 33af764374..8a5e1d7593 100644 --- a/recipes/rollingstone.recipe +++ b/recipes/rollingstone.recipe @@ -4,7 +4,6 @@ __copyright__ = '2010, Darko Miletic ' rollingstone.com ''' -import re from calibre.web.feeds.news import BasicNewsRecipe @@ -22,44 +21,8 @@ class RollingStone(BasicNewsRecipe): language = 'en' remove_empty_feeds = True publication_type = 'magazine' - masthead_url = 'http://www.rollingstone.com/templates/rolling-stone-templates/theme/rstheme/images/rsLogo.png' - extra_css = """ - body{font-family: Georgia,Times,serif } - img{margin-bottom: 0.4em; display:block} - """ - - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language - } - - preprocess_regexps = [ - (re.compile(r'xml:lang="en">.*?', re.DOTALL | re.IGNORECASE), lambda match: 'xml:lang="en">\n\n'), (re.compile( - r'.*?', re.DOTALL | re.IGNORECASE), lambda match: '\n\n') - ] - - keep_only_tags = [ - dict(attrs={'class': ['headerImgHolder', 'headerContent']}), dict(name='div', attrs={'id': [ - 'teaser', 'storyTextContainer']}), dict(name='div', attrs={'class': 'blogDetailModule clearfix'}) - ] - - remove_tags = [ - dict(name=['meta', 'iframe', 'object', 'embed']), dict( - attrs={'id': 'mpStoryHeader'}), dict(attrs={'class': 'relatedTopics'}) - ] - remove_attributes = ['lang', 'onclick', 'width', 'height', 'name'] - remove_tags_before = dict(attrs={'class': 'bloggerInfo'}) - remove_tags_after = dict(attrs={'class': 'relatedTopics'}) + auto_cleanup = True feeds = [ - - (u'All News', u'http://www.rollingstone.com/siteServices/rss/allNews'), - (u'All Blogs', u'http://www.rollingstone.com/siteServices/rss/allBlogs'), - (u'Movie Reviews', u'http://www.rollingstone.com/siteServices/rss/movieReviews'), - (u'Album Reviews', u'http://www.rollingstone.com/siteServices/rss/albumReviews'), - (u'Song Reviews', u'http://www.rollingstone.com/siteServices/rss/songReviews') + (u'All News', u'http://www.rollingstone.com/siteServices/rss/allNews'), ] - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - return soup diff --git a/recipes/rstones.recipe b/recipes/rstones.recipe deleted file mode 100644 index e507c378d7..0000000000 --- a/recipes/rstones.recipe +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python2 -__license__ = 'GPL v3' -__author__ = 'Tony Stegall' -__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobileread.com' -__version__ = 'v1.01' -__date__ = '07, October 2010' -__description__ = 'Rolling Stones Mag' - -''' -http://www.rollingstone.com -''' - -from calibre.web.feeds.news import BasicNewsRecipe - - -class RollingStones(BasicNewsRecipe): - __author__ = 'Tony Stegall' - description = 'Rolling Stones Mag' - cover_url = 'http://gallery.celebritypro.com/data/media/648/kid-rock-rolling-stone-cover.jpg' - masthead_url = 'http://origin.myfonts.com/s/ec/cc-200804/Rolling_Stone-logo.gif' - - title = 'Rolling Stones Mag' - category = 'Music Reviews, Movie Reviews, entertainment news' - - language = 'en' - timefmt = '[%a, %d %b, %Y]' - - oldest_article = 15 - max_articles_per_feed = 25 - use_embedded_content = False - no_stylesheets = True - auto_cleanup = True - - feeds = [ - (u'News', u'http://www.rollingstone.com/siteServices/rss/allNews'), - (u'Blogs', u'http://www.rollingstone.com/siteServices/rss/allBlogs'), - (u'Movie Reviews', u'http://www.rollingstone.com/siteServices/rss/movieReviews'), - (u'Album Reviews', u'http://www.rollingstone.com/siteServices/rss/albumReviews'), - (u'Song Reviews', u'http://www.rollingstone.com/siteServices/rss/songReviews'), - - - ] - - def print_version(self, url): - return url + '?print=true' diff --git a/recipes/sfbg.recipe b/recipes/sfbg.recipe index a0b24cd72c..cce5aa6af0 100644 --- a/recipes/sfbg.recipe +++ b/recipes/sfbg.recipe @@ -14,6 +14,5 @@ class SanFranciscoBayGuardian(BasicNewsRecipe): ] feeds = [ - ('sfbg', 'http://www.sfbg.com/rss.xml'), + ('sfbg', 'http://www.sfbg.com/feed/'), ] - diff --git a/recipes/shacknews.recipe b/recipes/shacknews.recipe index f21fcf8e47..acf6043496 100644 --- a/recipes/shacknews.recipe +++ b/recipes/shacknews.recipe @@ -9,23 +9,9 @@ class Shacknews(BasicNewsRecipe): oldest_article = 7 max_articles_per_feed = 100 language = 'en' - no_stylesheets = True - remove_tags = [dict(name='div', attrs={'class': ['nuggets', 'comments']}), - dict(name='p', attrs={'class': 'videoembed'})] - keep_only_tags = [dict(name='div', attrs={'class': 'story'})] + auto_cleanup = True + feeds = [ - (u'Latest News', u'http://feed.shacknews.com/shackfeed.xml'), - (u'PC', u'http://feed.shacknews.com/extras/tag_rss.x/PC'), - (u'Wii', u'http://feed.shacknews.com/extras/tag_rss.x/Nintendo+Wii'), - (u'Xbox 360', u'http://feed.shacknews.com/extras/tag_rss.x/Xbox+360'), - (u'Playstation 3', - u'http://feed.shacknews.com/extras/tag_rss.x/PlayStation+3'), - (u'PSP', u'http://feed.shacknews.com/extras/tag_rss.x/PSP'), - (u'Nintendo DS', u'http://feed.shacknews.com/extras/tag_rss.x/Nintendo+DS'), - (u'iPhone', u'http://feed.shacknews.com/extras/tag_rss.x/iPhone'), - (u'DLC', u'http://feed.shacknews.com/extras/tag_rss.x/DLC'), - (u'Valve', u'http://feed.shacknews.com/extras/tag_rss.x/Valve'), - (u'Electronic Arts', - u'http://feed.shacknews.com/extras/tag_rss.x/Electronic+Arts') + (u'Latest News', u'http://www.shacknews.com/shackfeed.xml'), ] diff --git a/recipes/staradvertiser.recipe b/recipes/staradvertiser.recipe index 2936a84fe9..91e285d8e6 100644 --- a/recipes/staradvertiser.recipe +++ b/recipes/staradvertiser.recipe @@ -14,69 +14,19 @@ class Starbulletin(BasicNewsRecipe): publisher = 'Honolulu Star-Advertiser' category = 'news, Honolulu, Hawaii' oldest_article = 2 - needs_subscription = True max_articles_per_feed = 100 language = 'en' no_stylesheets = True use_embedded_content = False encoding = 'utf8' publication_type = 'newspaper' - masthead_url = 'http://media.staradvertiser.com/designimages/star-advertiser-logo-small.gif' -# extra_css = """ -# body{font-family: Verdana,Arial,Helvetica,sans-serif} -# h1,.brown,.hsa_postCredit{color: #663300} -# .storyDeck{font-size: 1.2em; font-weight: bold} -# img{display: block} -# """ - - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True - } - keep_only_tags = [ - dict(attrs={'id': 'hsa_storyTitle'}), dict(attrs={'id': 'hsa_storyTitle article-important'}), dict(attrs={'class': ['hsa_dateStamp', 'hsa_postCredit', 'storyDeck']}), dict(name='span', attrs={'class': ['hsa_dateStamp', 'hsa_postCredit']}), dict(name='span', attrs={'class': ['hsa_dateStamp article-important', 'hsa_postCredit article-important']}), dict(name='div', attrs={'class': 'storytext article-important'}), dict(name='div', attrs={'class': 'storytext'}) # noqa - ] - remove_tags = [ - # removed 'span' from preceding list to permit keeping of author and - dict(name=['object', 'link', 'script', 'meta', - 'base', 'iframe']) # timestamp - , dict(attrs={'class': ['insideStoryImage', 'insideStoryAd']}), dict(attrs={'name': 'fb_share'}) - ] - - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - if self.username is not None and self.password is not None: - br.open('http://www.staradvertiser.com/manage/Login/') - br.select_form(name='loginForm') - br['email'] = self.username - br['password'] = self.password - br.submit() - return br + auto_cleanup = True feeds = [ - - (u'Breaking News', u'http://www.staradvertiser.com/news/breaking/index.rss'), - (u'News', u'http://www.staradvertiser.com/newspremium/index.rss'), - (u'Business', u'http://www.staradvertiser.com/businesspremium/index.rss'), - (u'Sports', u'http://www.staradvertiser.com/sportspremium/index.rss'), - (u'Features', u'http://www.staradvertiser.com/featurespremium/index.rss') + (u'Breaking News', + u'http://www.staradvertiser.com/category/breaking-news/feed/'), + (u'Business', u'http://www.staradvertiser.com/business/feed/'), + (u'Sports', u'http://www.staradvertiser.com/sports/feed/'), + (u'Features', + u'http://www.staradvertiser.com/featurespremium/index.rss') ] - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll('a'): - limg = item.find('img') - if item.string is not None: - str = item.string - item.replaceWith(str) - else: - if limg: - item.name = 'div' - item.attrs = [] - else: - str = self.tag_to_string(item) - item.replaceWith(str) - for item in soup.findAll('img'): - if not item.has_key('alt'): # noqa - item['alt'] = 'image' - return soup diff --git a/recipes/tech_world.recipe b/recipes/tech_world.recipe index a262ad6316..c6af3ffe02 100644 --- a/recipes/tech_world.recipe +++ b/recipes/tech_world.recipe @@ -10,17 +10,16 @@ http://www.techworld.com/ ''' from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ptempfile import PersistentTemporaryFile class techworld(BasicNewsRecipe): __author__ = 'Lorenzo Vigentini' description = 'Techworld offers the latest breaking IT industry news, product reviews, enterprise software downloads, how-to articles and expert blogs for technical professionals and enterprise users in the UK' # noqa - cover_url = 'http://www.techworld.com/graphics/header/site_logo.jpg' title = 'TechWorld' publisher = 'IDG Communication' - category = 'Apple, Mac, video, computing, product reviews, editing, cameras, production' + category = ('Apple, Mac, video, computing, product reviews, ' + 'editing, cameras, production') language = 'en' timefmt = '[%a, %d %b, %Y]' @@ -32,60 +31,16 @@ class techworld(BasicNewsRecipe): remove_javascript = True no_stylesheets = True - - temp_files = [] - articles_are_obfuscated = True - - def get_obfuscated_article(self, url): - br = self.get_browser() - br.open(url) - response = br.follow_link(url_regex='?getDynamicPage&print$', nr=0) - html = response.read() - self.temp_files.append(PersistentTemporaryFile('_fa.html')) - self.temp_files[-1].write(html) - self.temp_files[-1].close() - return self.temp_files[-1].name - - keep_only_tags = [ - dict(name='div', attrs={'id': 'articleBody'}), - dict(name='h2', attrs={'class': 'blogTitle'}), - dict(name='h3', attrs={'class': 'blogger'}), - ] - - remove_tags = [ - dict(name='div', attrs={'class': ['submissionBar', 'mpuContainer']}), - dict(name='div', attrs={'id': [ - 'breadcrumb', 'mainContentSidebar', 'articleIconsList', 'loginSubscribeBoxout']}), - dict(name='ul', attrs={'class': 'articleIconsList'}) - ] - remove_tags_after = [ - dict(name='div', attrs={'id': 'articleFooter'}) - ] + auto_cleanup = True feeds = [ - (u'News', u'http://www.techworld.com/rss/feeds/techworld-news.xml'), - (u'How-Tos', u'http://www.techworld.com/rss/feeds/techworld-how-tos.xml'), - (u'Reviews', u'http://www.techworld.com/rss/feeds/techworld-reviews.xml'), - (u'Features', u'http://www.techworld.com/rss/feeds/techworld-features.xml'), - (u'Storage', u'http://www.techworld.com/rss/feeds/techworld-storage.xml'), - (u'Applications', - u'http://www.techworld.com/rss/feeds/techworld-applications.xml'), - (u'Virtualization', - u'http://www.techworld.com/rss/feeds/techworld-virtualisation.xml'), - (u'Personal Tech', - u'http://www.techworld.com/rss/feeds/techworld-personal-tech.xml'), - (u'Green IT', u'http://www.techworld.com/rss/feeds/techworld-green-it.xml'), - (u'Security', u'http://www.techworld.com/rss/feeds/techworld-security.xml'), - (u'Operating Systems', - u'http://www.techworld.com/rss/feeds/techworld-operating-systems.xml'), - (u'Networking', u'http://www.techworld.com/rss/feeds/techworld-networking.xml'), - (u'Mobile and Wireless', - u'http://www.techworld.com/rss/feeds/techworld-mobile-wireless.xml'), - (u'Data Centre', u'http://www.techworld.com/rss/feeds/techworld-data-centre.xml'), - (u'SME', u'http://www.techworld.com/rss/feeds/techworld-sme.xml'), - (u'TechWorld Blogs', u'http://blogs.techworld.com/atom.xml') + (u'News', u'http://www.techworld.com/news/rss'), + (u'Tutorial', u'http://www.techworld.com/tutorial/rss'), + (u'Reviews', u'http://www.techworld.com/review/rss'), + (u'Features', u'http://www.techworld.com/features/rss'), + (u'Analysis', u'http://www.techworld.com/analysis/rss'), + (u'Galleries', + u'http://www.techworld.com/picture-gallery/rss'), + (u'TechWorld Blogs', + u'http://www.techworld.com/blog/rss'), ] - - extra_css = ''' - img {align:left;} - ''' diff --git a/recipes/technology_review.recipe b/recipes/technology_review.recipe index 8a47652685..f475f9ac4c 100644 --- a/recipes/technology_review.recipe +++ b/recipes/technology_review.recipe @@ -18,12 +18,14 @@ class TechnologyReview(BasicNewsRecipe): .subheadline {font: italic large} """ feeds = [ - (u'Computing', u'http://feeds.technologyreview.com/technology_review_Computing'), - (u'Web', u'http://feeds.technologyreview.com/technology_review_Web'), - (u'Communications', - u'http://feeds.technologyreview.com/technology_review_Communications'), - (u'Energy', u'http://feeds.technologyreview.com/technology_review_Energy'), - (u'Materials', u'http://feeds.technologyreview.com/technology_review_Materials'), - (u'Biomedicine', u'http://feeds.technologyreview.com/technology_review_Biotech'), - (u'Business', u'http://feeds.technologyreview.com/technology_review_Biztech') + (u'Computing', + u'http://feeds.technologyreview.com/technology_review_Computing'), + (u'Energy', + u'http://feeds.technologyreview.com/technology_review_Energy'), + (u'Materials', + u'http://feeds.technologyreview.com/technology_review_Materials'), + (u'Biomedicine', + u'http://feeds.technologyreview.com/technology_review_Biotech'), + (u'Business', + u'http://feeds.technologyreview.com/technology_review_Biztech') ] diff --git a/recipes/the_budget_fashionista.recipe b/recipes/the_budget_fashionista.recipe index ef4b49bf61..40c32c8f26 100644 --- a/recipes/the_budget_fashionista.recipe +++ b/recipes/the_budget_fashionista.recipe @@ -22,20 +22,8 @@ class TheBudgetFashionista(BasicNewsRecipe): category = 'news, fashion, comsetics, women' lang = 'en-US' language = 'en' + auto_cleanup = True - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang - } - - keep_only_tags = [dict(name='div', attrs={'class': 'columnLeft'})] - remove_tags_after = dict(name='div', attrs={'class': 'postDetails'}) - remove_tags = [dict(name=['object', 'link', 'script', - 'iframe', 'form', 'login-button'])] - - feeds = [(u'Articles', u'http://www.thebudgetfashionista.com/feeds/atom/')] - - def preprocess_html(self, soup): - for it in soup.findAll('img'): - if it.parent.name == 'a': - it.parent.name = 'div' - return soup + feeds = [(u'Articles', + u'http://feeds.feedburner.com/TheBudgetFashionista') + ] diff --git a/recipes/the_week_magazine_free.recipe b/recipes/the_week_magazine_free.recipe index 3690a17270..87e9151b8b 100644 --- a/recipes/the_week_magazine_free.recipe +++ b/recipes/the_week_magazine_free.recipe @@ -23,8 +23,5 @@ class TheWeek(BasicNewsRecipe): language = 'en' auto_cleanup = True feeds = [ - (u'News-Opinion', u'http://theweek.com/section/index/news_opinion.rss'), - (u'Business', u'http://theweek.com/section/index/business.rss'), - (u'Arts-Life', u'http://theweek.com/section/index/arts_life.rss'), - (u'Cartoons', u'http://theweek.com/section/index/cartoon_wit/0/all-cartoons.rss') + (u'Latest articles', u'http://theweek.com/rss.xml'), ] diff --git a/recipes/usatoday.recipe b/recipes/usatoday.recipe index b86a8afb01..5cef8507cc 100644 --- a/recipes/usatoday.recipe +++ b/recipes/usatoday.recipe @@ -14,7 +14,6 @@ class USAToday(BasicNewsRecipe): title = 'USA Today' __author__ = 'Kovid Goyal' description = 'newspaper' - cover_url = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg12/lg/USAT.jpg' encoding = 'utf-8' publisher = 'usatoday.com' category = 'news, usa' @@ -28,25 +27,42 @@ class USAToday(BasicNewsRecipe): filterDuplicates = True extra_css = ''' - h1, h2 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;} - #post-attributes, .info, .clear {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} - #post-body, #content {font-size:medium; font-family:Arial,Helvetica,sans-serif;} + h1, h2 { + font-size:xx-large; + font-family:Arial,Helvetica,sans-serif;} + #post-attributes, .info, + .clear { + font-size:xx-small; color:#4D4D4D; + font-family:Arial,Helvetica,sans-serif; + } + #post-body, + #content { + font-size:medium; + font-family:Arial,Helvetica,sans-serif; + } ''' feeds = [ - ('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'), - ('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'), - ('Personal Tech', 'http://rssfeeds.usatoday.com/UsatodaycomTech-PersonalTalk'), - ('Science', 'http://rssfeeds.usatoday.com/TP-ScienceFair'), - ('Health', 'http://rssfeeds.usatoday.com/UsatodaycomHealth-TopStories'), - ('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'), - ('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'), + ('Top Headlines', + 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'), + ('Tech Headlines', + 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'), + ('Personal Tech', + 'http://rssfeeds.usatoday.com/UsatodaycomTech-PersonalTalk'), + ('Health', + 'http://rssfeeds.usatoday.com/UsatodaycomHealth-TopStories'), + ('Travel Headlines', + 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'), + ('Money Headlines', + 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'), ('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'), - ('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'), - ('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'), - ('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'), - ('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories') + ('Sport Headlines', + 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'), + ('Weather Headlines', + 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'), + ('Most Popular', + 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'), ] auto_cleanup = True diff --git a/recipes/usnews.recipe b/recipes/usnews.recipe index ab68c4bffc..a1943a955f 100644 --- a/recipes/usnews.recipe +++ b/recipes/usnews.recipe @@ -20,40 +20,14 @@ class LaPrensa(BasicNewsRecipe): use_embedded_content = False encoding = 'utf-8' language = 'en' - - html2lrf_options = [ - '--comment', description, '--category', category, '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + \ - '"\ncomments="' + description + '"\ntags="' + category + '"' - - keep_only_tags = [ - dict(name='h1'), dict(name='div', attrs={'id': ['dateline']}), dict( - name='div', attrs={'class': ['blogCredit', 'body']}) - ] + auto_cleanup = True feeds = [ - - (u'Homepage', u'http://www.usnews.com/rss/usnews.rss'), - (u'Health', u'http://www.usnews.com/rss/health/index.rss'), - (u'Nation & World', u'http://www.usnews.com/rss/news/index.rss'), - (u'Money & Business', u'http://www.usnews.com/rss/business/index.rss'), - (u'Education', u'http://www.usnews.com/rss/education/index.rss'), - (u'Opinion', u'http://www.usnews.com/rss/opinion/index.rss'), - (u'Science', u'http://www.usnews.com/rss/science/index.rss') + (u'Homepage', u'http://www.usnews.com/rss/usnews.rss'), + (u'Health', u'http://www.usnews.com/rss/health'), + (u'Nation & World', u'http://www.usnews.com/rss/news'), + (u'Money & Business', u'http://www.usnews.com/rss/money'), + (u'Education', u'http://www.usnews.com/rss/education'), + (u'Opinion', u'http://www.usnews.com/rss/opinion'), + (u'Science', u'http://www.usnews.com/rss/science') ] - - def print_version(self, url): - return url.replace('.html', '_print.html') - - def get_article_url(self, article): - raw = article.get('link', None) - artcl, sep, unneeded = raw.rpartition('?') - return artcl - - def preprocess_html(self, soup): - del soup.body['onload'] - for item in soup.findAll(style=True): - del item['style'] - return soup diff --git a/recipes/waco_tribune.recipe b/recipes/waco_tribune.recipe index 922755df43..b14c7072e5 100644 --- a/recipes/waco_tribune.recipe +++ b/recipes/waco_tribune.recipe @@ -11,10 +11,10 @@ class AdvancedUserRecipe1278773519(BasicNewsRecipe): max_articles_per_feed = 100 feeds = [ - (u'News', u'http://www.wacotrib.com/news/index.rss2'), - (u'Sports', u'http://www.wacotrib.com/sports/index.rss2'), - (u'AccessWaco', u'http://www.wacotrib.com/accesswaco/index.rss2'), - (u'Opinions', u'http://www.wacotrib.com/opinion/index.rss2') + (u'News', u'http://www.wacotrib.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=news/ap_nation,news/ap_nation/*&f=rss'), + (u'Sports', u'http://www.wacotrib.com/search/?q=&t=article&l=25&d=&d1=&d2=&s=start_time&sd=desc&c[]=sports*&f=rss'), + (u'AccessWaco', u'http://www.wacotrib.com/search/?q=&t=article&l=25&d=&d1=&d2=&s=start_time&sd=desc&c[]=entertainment/accesswaco*&f=rss'), + (u'Opinions', u'http://www.wacotrib.com/search/?q=&t=article&l=25&d=&d1=&d2=&s=start_time&sd=desc&c[]=opinion*&f=rss') ] remove_javascript = True @@ -23,13 +23,4 @@ class AdvancedUserRecipe1278773519(BasicNewsRecipe): language = 'en' encoding = 'utf-8' conversion_options = {'linearize_tables': True} - masthead_url = 'http://media.wacotrib.com/designimages/wacotrib_logo.jpg' - keep_only_tags = [ - dict(name='div', attrs={'class': 'twoColumn left'}), - ] - remove_tags = [ - dict(name='div', attrs={'class': 'right blueLinks'}), - ] - remove_tags_after = [ - dict(name='div', attrs={'class': 'dottedRule'}), - ] + auto_cleanup = True diff --git a/recipes/wash_post.recipe b/recipes/wash_post.recipe index 4f667da20a..b1687236f1 100644 --- a/recipes/wash_post.recipe +++ b/recipes/wash_post.recipe @@ -4,7 +4,6 @@ __copyright__ = '2011, Darko Miletic ' www.washingtonpost.com ''' -from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe @@ -23,55 +22,18 @@ class TheWashingtonPost(BasicNewsRecipe): language = 'en' remove_empty_feeds = True publication_type = 'newspaper' - masthead_url = 'http://www.washingtonpost.com/rw/sites/twpweb/img/logos/twp_logo_300.gif' - cover_url = strftime( - 'http://www.washingtonpost.com/rw/WashingtonPost/Content/Epaper/%Y-%m-%d/Ax1.pdf') - extra_css = """ - body{font-family: Georgia,serif } - """ - - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language - } - - keep_only_tags = [ - dict(attrs={'id': ['content', 'entryhead', 'entrytext']})] - remove_tags = [ - dict(name=['meta', 'link', 'iframe', 'base']), dict( - attrs={'id': 'multimedia-leaf-page'}) - ] - remove_attributes = ['lang', 'property', 'epochtime', - 'datetitle', 'pagetype', 'contenttype', 'comparetime'] + auto_cleanup = True feeds = [ - - (u'World', u'http://feeds.washingtonpost.com/rss/world'), - (u'National', u'http://feeds.washingtonpost.com/rss/national'), - (u'White House', u'http://feeds.washingtonpost.com/rss/politics/whitehouse'), - (u'Business', u'http://feeds.washingtonpost.com/rss/business'), - (u'Opinions', u'http://feeds.washingtonpost.com/rss/opinions'), - (u'Investigations', u'http://feeds.washingtonpost.com/rss/investigations'), - (u'Local', u'http://feeds.washingtonpost.com/rss/local'), - (u'Entertainment', u'http://feeds.washingtonpost.com/rss/entertainment'), - (u'Sports', u'http://feeds.washingtonpost.com/rss/sports'), - (u'Redskins', u'http://feeds.washingtonpost.com/rss/sports/redskins'), - (u'Special Reports', u'http://feeds.washingtonpost.com/rss/national/special-reports') + (u'World', u'http://feeds.washingtonpost.com/rss/world'), + (u'National', u'http://feeds.washingtonpost.com/rss/national'), + (u'White House', + u'http://feeds.washingtonpost.com/rss/politics/whitehouse'), + (u'Business', u'http://feeds.washingtonpost.com/rss/business'), + (u'Opinions', u'http://feeds.washingtonpost.com/rss/opinions'), + (u'Local', u'http://feeds.washingtonpost.com/rss/local'), + (u'Entertainment', + u'http://feeds.washingtonpost.com/rss/entertainment'), + (u'Sports', u'http://feeds.washingtonpost.com/rss/sports'), + (u'Redskins', u'http://feeds.washingtonpost.com/rss/sports/redskins'), ] - - def print_version(self, url): - if '_story.html' in url: - return url.replace('_story.html', '_print.html') - return url - - def get_article_url(self, article): - link = BasicNewsRecipe.get_article_url(self, article) - if article.id.startswith('http'): - link = article.id - if 'washingtonpost.com' not in link: - self.log('Skipping ads:', link) - return None - for it in ['_video.html', '_gallery.html', '_links.html']: - if it in link: - self.log('Skipping non-article:', link) - return None - return link diff --git a/recipes/worldcrunch.recipe b/recipes/worldcrunch.recipe index d0e511c103..90b8bdac16 100644 --- a/recipes/worldcrunch.recipe +++ b/recipes/worldcrunch.recipe @@ -14,5 +14,5 @@ class Worldcrunch(BasicNewsRecipe): feeds = [ ('News', - 'http://www.worldcrunch.com/feed'), + 'http://www.worldcrunch.com/rss/rss.php'), ]