diff --git a/recipes/baltimore_sun.recipe b/recipes/baltimore_sun.recipe
index c49349289c..062abe3530 100644
--- a/recipes/baltimore_sun.recipe
+++ b/recipes/baltimore_sun.recipe
@@ -24,65 +24,39 @@ class BaltimoreSun(BasicNewsRecipe):
no_stylesheets = True
remove_javascript = True
remove_empty_feeds = True
+ auto_cleanup = False
ignore_duplicate_articles = {'title'}
keep_only_tags = [
- dict(name=['div', 'section'], attrs={'class': [
- "trb_article_title", "trb_article_leadart", 'trb_bylines', 'trb_article_dateline', 'trb_mainContent']}),
+ dict(name=['div'], attrs={'class': ['trb_ar_hl', 'trb_ar_hl_t',
+ 'trb_ar_la', 'trb_ar_by',
+ 'trb_ar_dateline', 'trb_ar_page']}),
]
remove_tags = [
dict(name=['meta', 'link']),
- dict(name=['div', 'aside'], attrs={'class': lambda x: x and set(x.split()).intersection({
- 'trb_gptAd', 'trb_panelmod_container', 'trb_socialize', 'trb_taboola', 'trb_embed_related'})}),
]
- def preprocess_html(self, soup):
- for img in soup.findAll('img'):
- img['src'] = img['data-baseurl']
- return soup
-
feeds = [
# News ##
(u'Top Headlines', u'http://feeds.feedburner.com/baltimoresun/news/rss2'),
(u'Breaking News', u'http://feeds.feedburner.com/baltimoresun/news/local/annearundel/rss2'),
(u'Top Maryland', u'http://feeds.feedburner.com/baltimoresun/news/local/rss2'),
- # (u'Anne Arundel County', u'http://feeds.feedburner.com/baltimoresun/news/local/annearundel/rss2'),
(u'Baltimore City', u'http://feeds.feedburner.com/baltimoresun/news/local/baltimore_city/rss20xml'),
- # (u'Baltimore County', u'http://feeds.feedburner.com/baltimoresun/news/local/baltimore_county/rss2'),
- # (u'Carroll County', u'http://feeds.feedburner.com/baltimoresun/news/local/carroll/rss2'),
- # (u'Harford County', u'http://feeds.feedburner.com/baltimoresun/news/local/harford/rss2),
- # (u'Howard County', u'http://feeds.feedburner.com/baltimoresun/news/local/howard/rss2'),
(u'Education', u'http://feeds.feedburner.com/baltimoresun/news/education/rss2'),
- # (u'Obituaries', u'http://feeds.feedburner.com/baltimoresun/news/obituaries/rss2'),
- (u'Local Politics',
- u'http://feeds.feedburner.com/baltimoresun/news/local/politics/rss2'),
+ (u'Local Politics', u'http://feeds.feedburner.com/baltimoresun/news/local/politics/rss2'),
(u'Weather', u'http://feeds.feedburner.com/baltimoresun/news/weather/site/rss2'),
- # (u'Traffic', u'http://feeds.feedburner.com/baltimoresun/news/traffic/rss2'),
(u'Nation/world', u'http://feeds.feedburner.com/baltimoresun/news/nationworld/rss2'),
- # (u'Weird News', u'http://feeds.feedburner.com/baltsun-weirdnews'),
# Sports##
(u'Top Sports', u'http://feeds.feedburner.com/baltimoresun/sports/rss2'),
(u'Orioles/Baseball', u'http://www.baltimoresun.com/sports/orioles/rss2.0.xml'),
- (u'Ravens/Football',
- u'http://feeds.feedburner.com/baltimoresun/sports/football/rss2'),
- # (u'Terps', u''http://feeds.feedburner.com/baltimoresun/sports/terps/rss2'),
- # (u'College Football', u''feed://feeds.feedburner.com/baltimoresun/sports/college/football/rss2'),
- # (u'Lacrosse', u'http://feeds.feedburner.com/baltimoresun/sports/college/lacrosse/rss2'),
- # (u'Horse Racing', u'http://feeds.feedburner.com/baltimoresun/sports/horseracing/rss2'),
- # (u'Golf', u'http://feeds.feedburner.com/baltimoresun/sports/golf/rss2'),
- # (u'NBA', u'http://feeds.feedburner.com/baltimoresun/sports/basketball/rss2'),
- # (u'High School', u'http://feeds.feedburner.com/baltimoresun/sports/highschool/rss2'),
- # (u'Outdoors', u'http://feeds.feedburner.com/baltimoresun/sports/outdoors/rss2'),
+ (u'Ravens/Football', u'http://feeds.feedburner.com/baltimoresun/sports/football/rss2'),
# Entertainment ##
- (u'Celebrity News', u'http://baltimore.feedsportal.com/c/34255/f/623042/index.rss'),
(u'Arts & Theater', u'http://feeds.feedburner.com/baltimoresun/entertainment/galleriesmuseums/rss2'),
(u'Movies', u'http://www.baltimoresun.com/entertainment/movies/rss2.0.xml'),
- (u'Music & Nightlife',
- u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'),
- (u'Restaurants & Food',
- u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'),
+ (u'Music & Nightlife', u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'),
+ (u'Restaurants & Food', u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'),
(u'TV/Media', u'http://www.baltimoresun.com/entertainment/tv/rss2.0.xml'),
# Life ##
@@ -91,104 +65,42 @@ class BaltimoreSun(BasicNewsRecipe):
(u'Living Green', u'http://www.baltimoresun.com/features/green/rss2.0.xml'),
(u'Fashion', u'http://www.baltimoresun.com/features/fashion/rss2.0.xml'),
(u'Travel', u'http://www.baltimoresun.com/travel/rss2.0.xml'),
- # (u'Faith', u'http://www.baltimoresun.com/features/faith/rss2.0.xml'),
# Business ##
(u'Top Business', u'http://www.baltimoresun.com/business/rss2.0.xml'),
(u'Technology', u'http://www.baltimoresun.com/business/technology/rss2.0.xml'),
- (u'Personal finance', u'http://baltimore.feedsportal.com/c/34255/f/623057/index.rss'),
(u'Real Estate', u'http://www.baltimoresun.com/classified/realestate/rss2.0.xml'),
- (u'Jobs', u'http://baltimore.feedsportal.com/c/34255/f/623059/index.rss'),
- # (u'DIY', u'http://baltimore.feedsportal.com/c/34255/f/623060/index.rss'),
- # (u'Consumer Safety', u'http://baltimore.feedsportal.com/c/34255/f/623061/index.rss'),
(u'Investing', u'http://www.baltimoresun.com/business/money/rss2.0.xml'),
# Opinion##
(u'Sun Editorials', u'http://www.baltimoresun.com/news/opinion/editorial/rss2.0.xml'),
(u'Op/Ed', u'http://www.baltimoresun.com/news/opinion/oped/rss2.0.xml'),
- (u'Readers Respond', u'http://baltimore.feedsportal.com/c/34255/f/623065/index.rss'),
-
- # Columnists ##
- (u'Kevin Cowherd', u'http://www.baltimoresun.com/sports/bal-columnist-cowherd,0,6829726.columnist-rss2.0.xml'),
- (u'Robert Ehrlich', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-ehrlich,0,1825227.columnist-rss2.0.xml'),
- (u'Jacques Kelly', u'http://www.baltimoresun.com/news/maryland/bal-columnist-kelly,0,1154701.columnist-rss2.0.xml'),
- (u'Marta H. Mossburg', u'http://www.baltimoresun.com/news/opinion/oped/bal-columnist-mossburg,0,7982155.columnist-rss2.0.xml'),
- (u'Mike Preston', u'http://www.baltimoresun.com/sports/bal-columnist-preston,0,6169796.columnist-rss2.0.xml'),
- (u'Susan Reimer', u'http://www.baltimoresun.com/news/opinion/bal-columnist-reimer,0,162466.columnist-rss2.0.xml'),
- (u'Dan Rodricks', u'http://www.baltimoresun.com/news/maryland/bal-columnist-rodricks,0,7089843.columnist-rss2.0.xml'),
- (u'Thomas F. Schaller', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-schaller,0,897397.columnist-rss2.0.xml'),
- (u'Peter Schmuck', u'http://www.baltimoresun.com/sports/bal-columnist-schmuck,0,7485088.columnist-rss2.0.xml'),
# News Blogs ##
- (u'Baltimore Crime Beat',
- u'http://baltimore.feedsportal.com/c/34255/f/623075/index.rss'),
(u'InsideEd', u'http://www.baltimoresun.com/news/maryland/education/blog/rss2.0.xml'),
- (u'Maryland Politics',
- u'http://www.baltimoresun.com/news/maryland/politics/blog/rss2.0.xml'),
- (u'Maryland Weather',
- u'http://www.baltimoresun.com/news/weather/weather-blog/rss2.0.xml'),
- (u'Second Opinion',
- u'http://www.baltimoresun.com/news/opinion/second-opinion-blog/rss2.0.xml'),
- (u'Sun Investigates',
- u'http://www.baltimoresun.com/news/maryland/sun-investigates/rss2.0.xml'),
+ (u'Maryland Politics', u'http://www.baltimoresun.com/news/maryland/politics/blog/rss2.0.xml'),
+ (u'Maryland Weather', u'http://www.baltimoresun.com/news/weather/weather-blog/rss2.0.xml'),
+ (u'Second Opinion', u'http://www.baltimoresun.com/news/opinion/second-opinion-blog/rss2.0.xml'),
+ (u'Sun Investigates', u'http://www.baltimoresun.com/news/maryland/sun-investigates/rss2.0.xml'),
(u'You Dont Say', u'http://www.baltimoresun.com/news/language-blog/rss2.0.xml'),
# Business Blogs ##
(u'BaltTech', u'http://www.baltimoresun.com/business/technology/blog/rss2.0.xml'),
- (u'Consuming Interests',
- u'http://www.baltimoresun.com/business/consuming-interests-blog/rss2.0.xml'),
- (u'The Real Estate Wonk',
- u'http://www.baltimoresun.com/business/real-estate/wonk/rss2.0.xml'),
+ (u'Consuming Interests', u'http://www.baltimoresun.com/business/consuming-interests-blog/rss2.0.xml'),
+ (u'The Real Estate Wonk', u'http://www.baltimoresun.com/business/real-estate/wonk/rss2.0.xml'),
# Entertainment Blogs ##
(u'ArtSmash', 'http://www.baltimoresun.com/entertainment/arts/artsmash/rss2.0.xml'),
- (u'Baltimore Diner', u'http://baltimore.feedsportal.com/c/34255/f/623088/index.rss'),
(u'Midnight Sun', u'http://www.baltimoresun.com/entertainment/music/midnight-sun-blog/rss2.0.xml'),
(u'Read Street', u'http://www.baltimoresun.com/features/books/read-street/rss2.0.xml'),
(u'Z on TV', u'http://www.baltimoresun.com/entertainment/tv/z-on-tv-blog/rss2.0.xml'),
# Life Blogs ##
- # (u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'),
- (u'Baltimore Insider',
- u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'),
+ (u'Baltimore Insider', u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'),
(u'Picture of Health', u'http://www.baltimoresun.com/health/blog/rss2.0.xml'),
- # (u'Unleashed', u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'),
-
- # b the site blogs ##
- (u'TV Lust', u'http://baltimore.feedsportal.com/c/34255/f/623096/index.rss'),
# Sports Blogs ##
- (u'Baltimore Sports Blitz',
- u'http://baltimore.feedsportal.com/c/34255/f/623097/index.rss'),
- # (u'Lacrosse Insider',u'http://www.baltimoresun.com/sports/lacrosse-blog/rss2.0.xml'),
(u'Orioles Insider', u'http://baltimore.feedsportal.com/c/34255/f/623100/index.rss'),
- (u'Ravens Insider',
- u'http://www.baltimoresun.com/sports/ravens/ravens-insider/rss2.0.xml'),
- # (u'Ring Posts', u'http://weblogs.baltimoresun.com/sports/wrestling/blog/index.xml'),
- (u'The Schmuck Stops Here',
- u'http://www.baltimoresun.com/sports/schmuck-blog/rss2.0.xml'),
- # (u'Tracking the Terps', u'http://weblogs.baltimoresun.com/sports/college/maryland_terps/blog/index.xml'),
- # (u'Varsity Letters', u'http://weblogs.baltimoresun.com/sports/highschool/varsityletters/index.xml'),
+ (u'Ravens Insider', u'http://www.baltimoresun.com/sports/ravens/ravens-insider/rss2.0.xml'),
+ (u'The Schmuck Stops Here', u'http://www.baltimoresun.com/sports/schmuck-blog/rss2.0.xml'),
]
-
- def get_article_url(self, article):
- ans = None
- try:
- s = article.summary
- ans = urllib.unquote(
- re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
- except:
- pass
- if ans is None:
- ans = article.get('feedburner_origlink',
- article.get('guid', article.get('link')))
- if ans is not None:
- return ans.replace('?track=rss', '')
-
- def skip_ad_pages(self, soup):
- text = soup.find(text='click here to continue to article')
- if text:
- a = text.parent
- url = a.get('href')
- if url:
- return self.index_to_soup(url, raw=True)
diff --git a/recipes/dallas.recipe b/recipes/dallas.recipe
index f60cf2d546..1cc03a6490 100644
--- a/recipes/dallas.recipe
+++ b/recipes/dallas.recipe
@@ -12,20 +12,24 @@ class DallasNews(BasicNewsRecipe):
auto_cleanup = True
feeds = [
+ ('News',
+ 'http://www.dallasnews.com/news.rss'),
('Local News',
- 'http://www.dallasnews.com/news/politics/local-politics/?rss'),
- ('National Politics',
- 'http://www.dallasnews.com/news/politics/national-politic/?rss'),
+ 'http://www.dallasnews.com/news/local-politics.rss'),
('State Politics',
- 'http://www.dallasnews.com/news/politics/state-politics/?rss'),
+ 'http://www.dallasnews.com/news/texas-politics.rss'),
('Religion',
- 'http://www.dallasnews.com/news/religion/?rss'),
+ 'http://www.dallasnews.com/life/faith.rss'),
('Crime',
- 'http://www.dallasnews.com/news/crime/headlines/?rss'),
+ 'http://www.dallasnews.com/news/crime.rss'),
('Celebrity News',
'http://www.dallasnews.com/entertainment/celebrity-news/?rss&listname=TopStories'),
- ('Nation',
- 'http://www.dallasnews.com/news/nation-world/nation/?rss'),
- ('World',
- 'http://www.dallasnews.com/news/nation-world/world/?rss'),
+ ('Business',
+ 'http://www.dallasnews.com/business.rss'),
+ ('Arts',
+ 'http://www.dallasnews.com/arts.rss'),
+ ('Life',
+ 'http://www.dallasnews.com/life.rss'),
+ ('Opinion',
+ 'http://www.dallasnews.com/opinion.rss'),
]
diff --git a/recipes/digital_arts.recipe b/recipes/digital_arts.recipe
index 8d437ad1ca..75ebea907f 100644
--- a/recipes/digital_arts.recipe
+++ b/recipes/digital_arts.recipe
@@ -18,12 +18,15 @@ articles_are_obfuscated = True
class digiArts(BasicNewsRecipe):
__author__ = 'Lorenzo Vigentini'
- description = 'Digital Arts - comprehensive coverage of the art of graphic design, 3D, animation, video, effects, web and interactive design, in print and online.' # noqa
+ description = ('Digital Arts - comprehensive coverage of the art of '
+ 'graphic design, 3D, animation, video, effects, web and '
+ 'interactive design, in print and online.') # noqa
cover_url = 'http://media.digitalartsonline.co.uk/graphics/logo_digital_arts.gif'
title = 'Digital Arts Magazine '
publisher = 'IDG Communication'
- category = 'Multimedia, photo, video, computing, product reviews, editing, cameras, production'
+ category = ('Multimedia, photo, video, computing, product reviews, '
+ 'editing, cameras, production')
language = 'en'
encoding = 'cp1252'
@@ -36,30 +39,22 @@ class digiArts(BasicNewsRecipe):
remove_javascript = True
no_stylesheets = True
-
- def get_obfuscated_article(self, url):
- br = self.get_browser()
- br.open(url + '&print')
-
- response = br.follow_link(url, nr=0)
- html = response.read()
-
- self.temp_files.append(PersistentTemporaryFile('_fa.html'))
- self.temp_files[-1].write(html)
- self.temp_files[-1].close()
- return self.temp_files[-1].name
+ auto_cleanup = False
keep_only_tags = [
- dict(name='div', attrs={'id': ['articleHeader', 'articleContent']})
+ dict(name='h1', attrs={'itemprop': 'headline'}),
+ dict(name='span', attrs={'itemprop': 'author'}),
+ dict(name='section', attrs={'class': 'articleBody'}),
]
- remove_tags = [
- dict(name='div', attrs={'class': ['submissionBar', 'mpuContainer']}),
- dict(name='div', attrs={'id': ['articleSidebar', 'articleFooter']})
- ]
- remove_tags_after = [
- dict(name='p', attrs={'id': 'articlePageList'})
- ]
+ # Feeds are found here: http://www.digitalartsonline.co.uk/rss/
feeds = [
- (u'Content', u'http://rss.feedsportal.com/c/662/f/8410/index.rss')
+ ('Latest News Articles',
+ 'http://www.digitalartsonline.co.uk/rss/feeds/digitalarts-news.xml'),
+ ('Latest Tutorials',
+ 'http://www.digitalartsonline.co.uk/rss/feeds/digitalarts-tutorials.xml'),
+ ('Latest Reviews',
+ 'http://www.digitalartsonline.co.uk/rss/feeds/digitalarts-reviews.xml'),
+ ('Latest Features',
+ 'http://www.digitalartsonline.co.uk/rss/feeds/digitalarts-features.xml'),
]
diff --git a/recipes/discover_magazine.recipe b/recipes/discover_magazine.recipe
index 92aa624b9f..16ab94fb0a 100644
--- a/recipes/discover_magazine.recipe
+++ b/recipes/discover_magazine.recipe
@@ -112,23 +112,13 @@ class DiscoverMagazine(BasicNewsRecipe):
return soup
feeds = [
- (u'Technology', u'http://discovermagazine.com/topics/technology/rss.xml'),
- (u'Health - Medicine',
- u'http://discovermagazine.com/topics/health-medicine/rss.xml'),
- (u'Mind Brain', u'http://discovermagazine.com/topics/mind-brain/rss.xml'),
- (u'Space', u'http://discovermagazine.com/topics/space/rss.xml'),
- (u'Human Origins', u'http://discovermagazine.com/topics/human-origins/rss.xml'),
- (u'Living World', u'http://discovermagazine.com/topics/living-world/rss.xml'),
- (u'Environment', u'http://discovermagazine.com/topics/environment/rss.xml'),
- (u'Physics & Math', u'http://discovermagazine.com/topics/physics-math/rss.xml'),
+ (u'Technology', u'http://feeds.feedburner.com/DiscoverTechnology'),
+ (u'Health & Medicine', u'http://feeds.feedburner.com/DiscoverHealthMedicine'),
+ (u'Mind Brain', u'http://feeds.feedburner.com/DiscoverMindBrain'),
+ (u'Space & Physics', u'http://feeds.feedburner.com/DiscoverSpace'),
+ (u'Living World', u'http://feeds.feedburner.com/DiscoverLivingWorld'),
+ (u'Environment', u'http://feeds.feedburner.com/DiscoverEnvironment'),
(u"20 Things you didn't know about...",
- u'http://discovermagazine.com/columns/20-things-you-didnt-know/rss.xml'),
- (u'Fuzzy Math', u'http://discovermagazine.com/columns/fuzzy-math/rss.xml'),
- (u'The Brain', u'http://discovermagazine.com/columns/the-brain/rss.xml'),
- (u'What is This', u'http://discovermagazine.com/columns/what-is-this/rss.xml'),
- (u'Vital Signs', u'http://discovermagazine.com/columns/vital-signs/rss.xml'),
- (u'Think Tech', u'http://discovermagazine.com/columns/think-tech/rss.xml'),
- (u'Future Tech', u'http://discovermagazine.com/columns/future-tech/rss.xml'),
- (u'Discover Interview',
- u'http://discovermagazine.com/columns/discover-interview/rss.xml'),
+ u'http://feeds.feedburner.com/20ThingsYouDidntKnowAbout'),
+ (u'Vital Signs', u'http://feeds.feedburner.com/discovermagazine/VitalSigns'),
]
diff --git a/recipes/editor_and_publisher.recipe b/recipes/editor_and_publisher.recipe
index aa188d5dfc..57342cbc95 100644
--- a/recipes/editor_and_publisher.recipe
+++ b/recipes/editor_and_publisher.recipe
@@ -18,9 +18,12 @@ class EandP(BasicNewsRecipe):
encoding = 'utf8'
cover_url = 'http://www.editorandpublisher.com/images/EP_main_logo.gif'
remove_javascript = True
+ auto_cleanup = True
html2lrf_options = [
- '--comment', description, '--category', category, '--publisher', publisher
+ '--comment', description,
+ '--category', category,
+ '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + \
@@ -34,21 +37,11 @@ class EandP(BasicNewsRecipe):
h2{font-size: large;}
'''
- # Keep only div:itemmgap
-
- keep_only_tags = [
- dict(name='div', attrs={'class': 'itemmgap'})
- ]
-
# Remove commenting/social media lins
remove_tags_after = [dict(name='div', attrs={'class': 'clear'})]
- feeds = [(u'Breaking News', u'http://www.editorandpublisher.com/GenerateRssFeed.aspx'),
- (u'Business News',
- u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=2'),
- (u'Ad/Circ News', u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=3'),
- (u'Newsroom', u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=4'),
- (u'Technology News',
- u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=5'),
- (u'Syndicates News', u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=7')]
+ feeds = [
+ (u'Editor & Publisher', u'http://www.editorandpublisher.com/feed/'),
+ (u'Comments', u'http://www.editorandpublisher.com/comments/feed/'),
+ ]
diff --git a/recipes/everett_herald.recipe b/recipes/everett_herald.recipe
index 5c20058022..fe9b017f96 100644
--- a/recipes/everett_herald.recipe
+++ b/recipes/everett_herald.recipe
@@ -5,32 +5,9 @@ class AdvancedUserRecipe1295088390(BasicNewsRecipe):
title = u'Everett Herald'
language = 'en'
__author__ = '77ja65'
- oldest_article = 4
+ oldest_article = 7
max_articles_per_feed = 50
no_stylesheets = True
- masthead_url = 'http://heraldnet.com/images/hnet/jQueryComponents/jQueryNavigation/heraldnet_logo.png'
- extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
+ auto_cleanup = True
- feeds = [(u'Local News',
- u'http://heraldnet.com/section/RSS02&mime=xml'),
- (u'Sports', u'http://heraldnet.com/section/RSS04&mime=xml'),
- (u'Entertainment',
- u'http://heraldnet.com/section/RSS07&mime=xml'),
- (u'Life', u'http://heraldnet.com/section/RSS03&mime=xml'),
- (u'Breaking News',
- u'http://heraldnet.com/section/RSS34&mime=xml'),
- (u'Seahawks', u'http://heraldnet.com/section/RSS22&mime=xml'),
- (u'HeraldNet', u'http://heraldnet.com/section/RSS01&mime=xml'),
- (u'Inside Everett',
- u'http://heraldnet.com/section/RSS26&mime=xml')
- ]
-
- def print_version(self, url):
- return url + "&template=PrinterFriendly"
-
- extra_css = '''
- h1{font-family:Arial,Helvetica,sans-serif; font-
- weight:bold;font-size:large;}
- h2{font-family:Arial,Helvetica,sans-serif; font-
- weight:normal;font-size:small;}
- '''
+ feeds = [(u'Local News', u'http://www.heraldnet.com/feed/')]
diff --git a/recipes/fairbanks_daily.recipe b/recipes/fairbanks_daily.recipe
index f80cd7779c..6435406cb5 100644
--- a/recipes/fairbanks_daily.recipe
+++ b/recipes/fairbanks_daily.recipe
@@ -6,101 +6,29 @@ class FairbanksDailyNewsminer(BasicNewsRecipe):
__author__ = 'Roger'
oldest_article = 7
max_articles_per_feed = 100
-
description = 'The voice of interior Alaska since 1903'
publisher = 'http://www.newsminer.com/'
category = 'news, Alaska, Fairbanks'
language = 'en'
-
- # Make article titles, author and date bold, italic or small font.
- # http://assets.matchbin.com/sites/635/stylesheets/newsminer.com.css
- # (signature_line contains date, views, comments)
- extra_css = '''
- .story_item_headline { font-size: medium; font-weight: bold; }
- .story_item_author { font-size: small; font-style:italic; }
- .signature_line { font-size: small; }
- '''
-
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
language = 'en'
encoding = 'utf8'
conversion_options = {'linearize_tables': True}
+ auto_cleanup = True
- # TODO: The News-miner cover image seems a bit small. Can this be
- # enlarged by 10-30%?
- masthead_url = 'http://d2uh5w9wm14i0w.cloudfront.net/sites/635/assets/top_masthead_-_menu_pic.jpg'
-
- # In order to omit seeing number of views, number of posts and the pipe
- # symbol for divider after the title and date of the article, a regex or
- # manual processing is needed to get just the "story_item_date updated"
- # (which contains the date). Everything else on this line is pretty much not needed.
- #
- # Currently, you will see the following:
- # | Aug 24, 2011 | 654 views | 6 | |
- # (ie. 6 comments)
- #
-
- # The following was suggested, but it looks like I also need to define self & soup
- # (as well as bring in extra soup depends?)
- # date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'}))
-
- # preprocess_regexps = [(re.compile(r']*addthis_separator*>'), lambda match: '') ]
- # preprocess_regexps = [(re.compile(r'span class="addthis_separator">|'), lambda match: '') ]
-
- # preprocess_regexps = [
- # (re.compile(r'.*?', re.IGNORECASE | re.DOTALL), lambda match : ''),
- # ]
-
- # def get_browser(self):
- # def preprocess_html(soup, first_fetch):
- # date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'}))
- # return
-
- # preprocess_regexps = [(re.compile(r' |.*?', re.DOTALL), lambda m: '')]
-
- keep_only_tags = [
- dict(name='div', attrs={'class': 'story_item_headline entry-title'}),
- dict(name='div', attrs={'class': 'full_story'})
- ]
-
- remove_tags = [
- # Try getting rid of some signature_line (date line) stuff
- dict(name='img', attrs={'class': 'dont_touch_me'}),
- dict(name='span', attrs={
- 'class': 'number_recommendations'}),
-
- # Removes div within
- dict(name='div', attrs={
- 'class': 'addthis_toolbox addthis_default_style'}),
-
- dict(name='div', attrs={'class': 'related_content'}),
- dict(name='div', attrs={'id': 'comments_container'})
- ]
-
- # Comment-out or uncomment any of the following RSS feeds according to your
- # liking.
- #
- # TODO: Some random bits of text might be trailing the last page (or TOC on
- # MOBI files), these are bits of public posts and comments and need to also
- # be removed.
- #
feeds = [
- (u'Alaska News', u'http://newsminer.com/rss/rss_feeds/alaska_news?content_type=article&tags=alaska_news&page_name=rss_feeds&instance=alaska_news'),
- (u'Local News', u'http://newsminer.com/rss/rss_feeds/local_news?content_type=article&tags=local_news&page_name=rss_feeds&offset=0&instance=local_news'),
- (u'Business', u'http://newsminer.com/rss/rss_feeds/business_news?content_type=article&tags=business_news&page_name=rss_feeds&instance=business_news'),
- (u'Politics', u'http://newsminer.com/rss/rss_feeds/politics_news?content_type=article&tags=politics_news&page_name=rss_feeds&instance=politics_news'),
- (u'Sports', u'http://newsminer.com/rss/rss_feeds/sports_news?content_type=article&tags=sports_news&page_name=rss_feeds&instance=sports_news'),
- (u'Latitude 65 feed', u'http://newsminer.com/rss/rss_feeds/latitude_65?content_type=article&tags=latitude_65&page_name=rss_feeds&offset=0&instance=latitude_65'), # noqa
- # (u'Sundays', u'http://newsminer.com/rss/rss_feeds/Sundays?content_type=article&tags=alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Sundays'), # noqa
- (u'Outdoors', u'http://newsminer.com/rss/rss_feeds/Outdoors?content_type=article&tags=outdoors&page_name=rss_feeds&instance=Outdoors'),
- # (u'Fairbanks Grizzlies', u'http://newsminer.com/rss/rss_feeds/fairbanks_grizzlies?content_type=article&tags=fairbanks_grizzlies&page_name=rss_feeds&instance=fairbanks_grizzlies'), # noqa
- # (u'Newsminer', u'http://newsminer.com/rss/rss_feeds/Newsminer?content_type=article&tags=ted_stevens_bullets+ted_stevens+sports_news+business_news+fairbanks_grizzlies+dermot_cole_column+outdoors+alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Newsminer'), # noqa
- (u'Opinion', u'http://newsminer.com/rss/rss_feeds/Opinion?content_type=article&tags=editorials&page_name=rss_feeds&instance=Opinion'),
- (u'Youth', u'http://newsminer.com/rss/rss_feeds/Youth?content_type=article&tags=youth&page_name=rss_feeds&instance=Youth'),
- # (u'Dermot Cole Blog', u'http://newsminer.com/rss/rss_feeds/dermot_cole_blog+rss?content_type=blog+entry&sort_by=posted_on&user_ids=3015275&page_name=blogs_dermot_cole&limit=10&instance=dermot_cole_blog+rss'), # noqa
- (u'Dermot Cole Column', u'http://newsminer.com/rss/rss_feeds/Dermot_Cole_column?content_type=article&tags=dermot_cole_column&page_name=rss_feeds&instance=Dermot_Cole_column'), # noqa
- # (u'Sarah Palin', u'http://newsminer.com/rss/rss_feeds/sarah_palin?content_type=article&tags=palin_in_the_news+palin_on_the_issues&page_name=rss_feeds&tag_inclusion=or&instance=sarah_palin') # noqa
+ (u'Alaska News',
+ u'http://www.newsminer.com/search/?f=rss&t=article&c=news/alaska_news&l=50&s=start_time&sd=desc'),
+ (u'Local News',
+ u'http://www.newsminer.com/search/?f=rss&t=article&c=news/local_news&l=50&s=start_time&sd=desc'),
+ (u'Business',
+ u'http://www.newsminer.com/search/?f=rss&t=article&c=business&l=50&s=start_time&sd=desc'),
+ (u'Politics',
+ u'http://www.newsminer.com/search/?f=rss&t=article&c=news/politics&l=50&s=start_time&sd=desc'),
+ (u'Sports',
+ u'http://www.newsminer.com/search/?f=rss&t=article&c=sports&l=50&s=start_time&sd=desc'),
+ (u'Opinion',
+ u'http://www.newsminer.com/search/?f=rss&t=article&c=opinion&l=50&s=start_time&sd=desc'),
]
diff --git a/recipes/fan_graphs.recipe b/recipes/fan_graphs.recipe
index 67e536e1f5..892f6699bf 100644
--- a/recipes/fan_graphs.recipe
+++ b/recipes/fan_graphs.recipe
@@ -20,6 +20,7 @@ class FanGraphs(BasicNewsRecipe):
category = 'Baseball'
language = 'en'
publication_type = 'Blog'
+ auto_cleanup = True
description = 'Baseball statistical analysis, graphs, and projections.'
__author__ = 'David Appelman'
@@ -27,9 +28,8 @@ class FanGraphs(BasicNewsRecipe):
feeds = [
(u'Fangraphs', u'http://feeds.feedburner.com/FanGraphs?format=xml'),
- (u'Rotographs', u'http://www.wizardrss.com/feed/feeds.feedburner.com/RotoGraphs?format=xml'),
- (u'Community', u'http://www.wizardrss.com/feed/www.fangraphs.com/community/?feed=rss2'),
- (u'NotGraphs', u'http://www.wizardrss.com/feed/www.fangraphs.com/not/?feed=rss2')]
+ (u'Rotographs', u'http://feeds.feedburner.com/RotoGraphs?format=xml'),
+ (u'NotGraphs', u'http://feeds.feedburner.com/NotGraphs?format=xml')]
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
diff --git a/recipes/glamour.recipe b/recipes/glamour.recipe
index 0c33066447..d53edb4176 100644
--- a/recipes/glamour.recipe
+++ b/recipes/glamour.recipe
@@ -10,37 +10,27 @@ class AdvancedUserRecipe1305547242(BasicNewsRecipe):
language = 'en'
remove_javascript = True
__author__ = 'Anonymous'
- remove_tags = [
- dict(name='div', attrs={'class': 'articles_footer', 'class': 'printoptions'})]
+ auto_cleanup = True
- def print_version(self, url):
- return url + '?printable=true'
-
- def preprocess_html(self, soup):
- for alink in soup.findAll('a'):
- if alink.string is not None:
- tstr = alink.string
- alink.replaceWith(tstr)
- return soup
-
- feeds = [ (u'All Fashion', u'http://feeds.glamour.com/glamour/all_fashion'),
- (u'All Beauty', u'http://feeds.glamour.com/glamour/all_beauty'),
- (u'All Sex, Love & Life',
- u'http://feeds.glamour.com/glamour/sex_love_life'),
- (u'All Health & Fitness',
- u'http://feeds.glamour.com/glamour/health_fitness'),
- (u'Shopping', u'http://feeds.glamour.com/glamour/shopping'),
- (u'Slaves to Fashion blog',
- u'http://feeds.glamour.com/glamour/slavestofashion'),
- (u'The Girls in the Beauty Department',
- u'http://feeds.glamour.com/glamour/thegirlsinthebeautydepartment'),
- (u'Smitten blog', u'http://feeds.glamour.com/glamour/smitten'),
- (u'Save the Date', u'http://feeds.feedburner.com/glamour/save-the-date'),
- (u'Single-ish blog', u'http://feeds.glamour.com/glamour/glamoursingle-ish'),
- (u'Save the Date', u'http://feeds.feedburner.com/glamour/save-the-date'),
- (u'Vitamin G blog', u'http://feeds.glamour.com/glamour/vitamin-g'),
- (u'Margarita Shapes Up blog',
- u'http://feeds.glamour.com/glamour/margaritashapesup'),
- (u'Little Miss Fortune blog',
- u'http://feeds.glamour.com/glamour/little-miss-fortune'),
- ]
+ feeds = [
+ (u'All Fashion',
+ u'http://feeds.glamour.com/glamour/all_fashion'),
+ (u'All Beauty',
+ u'http://feeds.glamour.com/glamour/all_beauty'),
+ (u'All Sex, Love & Life',
+ u'http://feeds.glamour.com/glamour/sex_love_life'),
+ (u'All Health & Fitness',
+ u'http://feeds.glamour.com/glamour/health_fitness'),
+ (u'Slaves to Fashion blog',
+ u'http://feeds.glamour.com/glamour/slavestofashion'),
+ (u'The Girls in the Beauty Department',
+ u'http://feeds.glamour.com/glamour/thegirlsinthebeautydepartment'),
+ (u'Smitten blog',
+ u'http://feeds.glamour.com/glamour/smitten'),
+ (u'Save the Date',
+ u'http://feeds.feedburner.com/glamour/save-the-date'),
+ (u'Save the Date',
+ u'http://feeds.feedburner.com/glamour/save-the-date'),
+ (u'Vitamin G blog',
+ u'http://feeds.glamour.com/glamour/vitamin-g'),
+ ]
diff --git a/recipes/greensboro_news_and_record.recipe b/recipes/greensboro_news_and_record.recipe
index 858f42f387..08e64a4fc1 100644
--- a/recipes/greensboro_news_and_record.recipe
+++ b/recipes/greensboro_news_and_record.recipe
@@ -19,29 +19,19 @@ class NewsandRecord(BasicNewsRecipe):
encoding = 'utf-8'
remove_javascript = True
no_stylesheets = True
+ auto_cleanup = True
conversion_options = {
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
- remove_tags_before = dict(name='h3', attrs={'class': 'nrcTxt_headline'})
- remove_tags_after = dict(name='div', attrs={'id': 'nrcBlk_ContentBody'})
-
- remove_tags = [
- dict(name='iframe'),
- dict(name=['notags', 'embed', 'object', 'link', 'img']),
-
- ]
-
feeds = [
- ('News', 'http://www.news-record.com/news/archive/feed'),
- ('Greensboro News', 'http://www.news-record.com/news/greensboro/feed'),
- ('Education', 'http://www.news-record.com/news/education/feed'),
- ('Government', 'http://www.news-record.com/news/government/feed'),
- ('College Sports', 'http://www.news-record.com/sports/college/feed'),
- ('Sports Extra', 'http://www.news-record.com/blog/sportsextra/feed'),
- ('Life', 'http://www.news-record.com/life/top/feed'),
- ('NASCAR', 'http://www.news-record.com/sports/nascar/top/feed'),
- ('Editorials', 'http://www.news-record.com/opinion/editorials/feed'),
- ('Letters to the Editor', 'http://www.news-record.com/opinion/letters/feed')
+ ('News', 'http://www.greensboro.com/search/?q=&t=article&l=10&d=&d1=&d2=&s=start_time&sd=desc&c[]=news,news/*&f=rss'),
+ ('Greensboro News', 'http://www.greensboro.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=news/local,news/crime,news/goverment,news/schools,news/rockingham_county,news/local,news/crime,news/goverment,news/schools,news/rockingham_county/*&f=rss'),
+ ('Business', 'http://www.greensboro.com/search/?q=&t=article&l=10&d=&d1=&d2=&s=start_time&sd=desc&c[]=business,business/*&f=rss'),
+ ('Local Business', 'http://www.greensboro.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=business/local_business,business/local_business/*&f=rss'),
+ ('Sports', 'http://www.greensboro.com/search/?q=&t=article&l=10&d=&d1=&d2=&s=start_time&sd=desc&c[]=sports,sports/*&f=rss'),
+ ('College Sports', 'http://www.greensboro.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=sports/college,sports/college/*&f=rss'),
+ ('Sports Extra', 'http://www.greensboro.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=blogs/sports_extra,blogs/sports_extra/*&f=rss'),
+ ('Life', 'http://www.greensboro.com/search/?q=&t=article&l=10&d=&d1=&d2=&s=start_time&sd=desc&c[]=life,life/*&f=rss'),
]
diff --git a/recipes/hartford_courant.recipe b/recipes/hartford_courant.recipe
index 994cda3a9f..c2fa3fe92d 100644
--- a/recipes/hartford_courant.recipe
+++ b/recipes/hartford_courant.recipe
@@ -12,36 +12,11 @@ class ChicagoTribune(BasicNewsRecipe):
__author__ = 'Being and Sujata Raman'
description = 'Politics, local and business news from Hartford'
language = 'en'
-
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
+ auto_cleanup = True
- keep_only_tags = [dict(name='div', attrs={'class': ["story", "entry-asset asset hentry"]}),
- dict(name='div', attrs={
- 'id': ["pagebody", "story", "maincontentcontainer"]}),
- ]
- remove_tags_after = [{'class': ['photo_article', ]}]
-
- remove_tags = [
- {'id': ["moduleArticleTools", "content-bottom", "rail", "articleRelates module", "toolSet", "relatedrailcontent", "div-wrapper", "beta", "atp-comments", "footer"]}, # noqa
- {'class': ["clearfix", "relatedTitle", "articleRelates module", "asset-footer", "tools", "comments",
- "featurePromo", "featurePromo fp-topjobs brownBackground", "clearfix fullSpan brownBackground", "curvedContent"]},
- dict(name='font', attrs={'id': ["cr-other-headlines"]})]
- extra_css = '''
- h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
- h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
- .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
- .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
- p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
- .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
- .story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
- .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
- .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
- .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
- .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
- body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
- '''
feeds = [
('Breaking News', 'http://feeds.feedburner.com/courant-breaking-news/'),
('Nation/World News', 'http://feeds.feedburner.com/courant-nation-world/'),
@@ -64,29 +39,9 @@ class ChicagoTribune(BasicNewsRecipe):
('Music', 'http://feeds.feedburner.com/courant-music/'),
('TV', 'http://feeds.feedburner.com/courant-tv/'),
('Movies', 'http://feeds.feedburner.com/courant-movies/'),
- # ('Metromix headlines', 'http://feeds.feedburner.com/metromix/topheadlines/'),
- # ('Metromix events', 'http://feeds.feedburner.com/metromix/events/'),
- # ('Metromix restaurants', 'http://feeds.feedburner.com/metromix/restaurants/'),
('Outdoors', 'http://feeds.feedburner.com/courant-outdoors/'),
('Peter Marteka', 'http://feeds.feedburner.com/courant-marteka-column/'),
- ('Susan Campbell', 'http://feeds.feedburner.com/courant-campbell-column/'),
- ('Helen Ubinas', 'http://feeds.feedburner.com/courant-helen-ubinas-column/'),
('Jim Shea', 'http://feeds.feedburner.com/courant-jim-shea-column/'),
('Tom Condon', 'http://feeds.feedburner.com/courant-tom-condon-column/'),
('Colin McEnroe', 'http://feeds.feedburner.com/courant-colin-mcenroe-column/'),
]
-
- def get_article_url(self, article):
- print article.get('feedburner_origlink', article.get('guid', article.get('link')))
- return article.get('feedburner_origlink', article.get('guid', article.get('link')))
-
- def postprocess_html(self, soup, first_fetch):
- for t in soup.findAll(['table', 'tr', 'td']):
- t.name = 'div'
-
- for tag in soup.findAll('form', dict(attrs={'name': ["comments_form"]})):
- tag.extract()
- for tag in soup.findAll('font', dict(attrs={'id': ["cr-other-headlines"]})):
- tag.extract()
-
- return soup
diff --git a/recipes/icons/digital_arts.png b/recipes/icons/digital_arts.png
new file mode 100644
index 0000000000..89e7a619c0
Binary files /dev/null and b/recipes/icons/digital_arts.png differ
diff --git a/recipes/icons/heritage_foundation.png b/recipes/icons/heritage_foundation.png
new file mode 100644
index 0000000000..190a36c8e5
Binary files /dev/null and b/recipes/icons/heritage_foundation.png differ