mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Fixed various recipes that had broken/dead feed links.
This commit is contained in:
parent
d2eb1426b0
commit
445955a537
@ -24,65 +24,39 @@ class BaltimoreSun(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
|
auto_cleanup = False
|
||||||
|
|
||||||
ignore_duplicate_articles = {'title'}
|
ignore_duplicate_articles = {'title'}
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(name=['div', 'section'], attrs={'class': [
|
dict(name=['div'], attrs={'class': ['trb_ar_hl', 'trb_ar_hl_t',
|
||||||
"trb_article_title", "trb_article_leadart", 'trb_bylines', 'trb_article_dateline', 'trb_mainContent']}),
|
'trb_ar_la', 'trb_ar_by',
|
||||||
|
'trb_ar_dateline', 'trb_ar_page']}),
|
||||||
]
|
]
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name=['meta', 'link']),
|
dict(name=['meta', 'link']),
|
||||||
dict(name=['div', 'aside'], attrs={'class': lambda x: x and set(x.split()).intersection({
|
|
||||||
'trb_gptAd', 'trb_panelmod_container', 'trb_socialize', 'trb_taboola', 'trb_embed_related'})}),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
|
||||||
for img in soup.findAll('img'):
|
|
||||||
img['src'] = img['data-baseurl']
|
|
||||||
return soup
|
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
# News ##
|
# News ##
|
||||||
(u'Top Headlines', u'http://feeds.feedburner.com/baltimoresun/news/rss2'),
|
(u'Top Headlines', u'http://feeds.feedburner.com/baltimoresun/news/rss2'),
|
||||||
(u'Breaking News', u'http://feeds.feedburner.com/baltimoresun/news/local/annearundel/rss2'),
|
(u'Breaking News', u'http://feeds.feedburner.com/baltimoresun/news/local/annearundel/rss2'),
|
||||||
(u'Top Maryland', u'http://feeds.feedburner.com/baltimoresun/news/local/rss2'),
|
(u'Top Maryland', u'http://feeds.feedburner.com/baltimoresun/news/local/rss2'),
|
||||||
# (u'Anne Arundel County', u'http://feeds.feedburner.com/baltimoresun/news/local/annearundel/rss2'),
|
|
||||||
(u'Baltimore City', u'http://feeds.feedburner.com/baltimoresun/news/local/baltimore_city/rss20xml'),
|
(u'Baltimore City', u'http://feeds.feedburner.com/baltimoresun/news/local/baltimore_city/rss20xml'),
|
||||||
# (u'Baltimore County', u'http://feeds.feedburner.com/baltimoresun/news/local/baltimore_county/rss2'),
|
|
||||||
# (u'Carroll County', u'http://feeds.feedburner.com/baltimoresun/news/local/carroll/rss2'),
|
|
||||||
# (u'Harford County', u'http://feeds.feedburner.com/baltimoresun/news/local/harford/rss2),
|
|
||||||
# (u'Howard County', u'http://feeds.feedburner.com/baltimoresun/news/local/howard/rss2'),
|
|
||||||
(u'Education', u'http://feeds.feedburner.com/baltimoresun/news/education/rss2'),
|
(u'Education', u'http://feeds.feedburner.com/baltimoresun/news/education/rss2'),
|
||||||
# (u'Obituaries', u'http://feeds.feedburner.com/baltimoresun/news/obituaries/rss2'),
|
(u'Local Politics', u'http://feeds.feedburner.com/baltimoresun/news/local/politics/rss2'),
|
||||||
(u'Local Politics',
|
|
||||||
u'http://feeds.feedburner.com/baltimoresun/news/local/politics/rss2'),
|
|
||||||
(u'Weather', u'http://feeds.feedburner.com/baltimoresun/news/weather/site/rss2'),
|
(u'Weather', u'http://feeds.feedburner.com/baltimoresun/news/weather/site/rss2'),
|
||||||
# (u'Traffic', u'http://feeds.feedburner.com/baltimoresun/news/traffic/rss2'),
|
|
||||||
(u'Nation/world', u'http://feeds.feedburner.com/baltimoresun/news/nationworld/rss2'),
|
(u'Nation/world', u'http://feeds.feedburner.com/baltimoresun/news/nationworld/rss2'),
|
||||||
# (u'Weird News', u'http://feeds.feedburner.com/baltsun-weirdnews'),
|
|
||||||
|
|
||||||
# Sports##
|
# Sports##
|
||||||
(u'Top Sports', u'http://feeds.feedburner.com/baltimoresun/sports/rss2'),
|
(u'Top Sports', u'http://feeds.feedburner.com/baltimoresun/sports/rss2'),
|
||||||
(u'Orioles/Baseball', u'http://www.baltimoresun.com/sports/orioles/rss2.0.xml'),
|
(u'Orioles/Baseball', u'http://www.baltimoresun.com/sports/orioles/rss2.0.xml'),
|
||||||
(u'Ravens/Football',
|
(u'Ravens/Football', u'http://feeds.feedburner.com/baltimoresun/sports/football/rss2'),
|
||||||
u'http://feeds.feedburner.com/baltimoresun/sports/football/rss2'),
|
|
||||||
# (u'Terps', u''http://feeds.feedburner.com/baltimoresun/sports/terps/rss2'),
|
|
||||||
# (u'College Football', u''feed://feeds.feedburner.com/baltimoresun/sports/college/football/rss2'),
|
|
||||||
# (u'Lacrosse', u'http://feeds.feedburner.com/baltimoresun/sports/college/lacrosse/rss2'),
|
|
||||||
# (u'Horse Racing', u'http://feeds.feedburner.com/baltimoresun/sports/horseracing/rss2'),
|
|
||||||
# (u'Golf', u'http://feeds.feedburner.com/baltimoresun/sports/golf/rss2'),
|
|
||||||
# (u'NBA', u'http://feeds.feedburner.com/baltimoresun/sports/basketball/rss2'),
|
|
||||||
# (u'High School', u'http://feeds.feedburner.com/baltimoresun/sports/highschool/rss2'),
|
|
||||||
# (u'Outdoors', u'http://feeds.feedburner.com/baltimoresun/sports/outdoors/rss2'),
|
|
||||||
|
|
||||||
# Entertainment ##
|
# Entertainment ##
|
||||||
(u'Celebrity News', u'http://baltimore.feedsportal.com/c/34255/f/623042/index.rss'),
|
|
||||||
(u'Arts & Theater', u'http://feeds.feedburner.com/baltimoresun/entertainment/galleriesmuseums/rss2'),
|
(u'Arts & Theater', u'http://feeds.feedburner.com/baltimoresun/entertainment/galleriesmuseums/rss2'),
|
||||||
(u'Movies', u'http://www.baltimoresun.com/entertainment/movies/rss2.0.xml'),
|
(u'Movies', u'http://www.baltimoresun.com/entertainment/movies/rss2.0.xml'),
|
||||||
(u'Music & Nightlife',
|
(u'Music & Nightlife', u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'),
|
||||||
u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'),
|
(u'Restaurants & Food', u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'),
|
||||||
(u'Restaurants & Food',
|
|
||||||
u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'),
|
|
||||||
(u'TV/Media', u'http://www.baltimoresun.com/entertainment/tv/rss2.0.xml'),
|
(u'TV/Media', u'http://www.baltimoresun.com/entertainment/tv/rss2.0.xml'),
|
||||||
|
|
||||||
# Life ##
|
# Life ##
|
||||||
@ -91,104 +65,42 @@ class BaltimoreSun(BasicNewsRecipe):
|
|||||||
(u'Living Green', u'http://www.baltimoresun.com/features/green/rss2.0.xml'),
|
(u'Living Green', u'http://www.baltimoresun.com/features/green/rss2.0.xml'),
|
||||||
(u'Fashion', u'http://www.baltimoresun.com/features/fashion/rss2.0.xml'),
|
(u'Fashion', u'http://www.baltimoresun.com/features/fashion/rss2.0.xml'),
|
||||||
(u'Travel', u'http://www.baltimoresun.com/travel/rss2.0.xml'),
|
(u'Travel', u'http://www.baltimoresun.com/travel/rss2.0.xml'),
|
||||||
# (u'Faith', u'http://www.baltimoresun.com/features/faith/rss2.0.xml'),
|
|
||||||
|
|
||||||
# Business ##
|
# Business ##
|
||||||
(u'Top Business', u'http://www.baltimoresun.com/business/rss2.0.xml'),
|
(u'Top Business', u'http://www.baltimoresun.com/business/rss2.0.xml'),
|
||||||
(u'Technology', u'http://www.baltimoresun.com/business/technology/rss2.0.xml'),
|
(u'Technology', u'http://www.baltimoresun.com/business/technology/rss2.0.xml'),
|
||||||
(u'Personal finance', u'http://baltimore.feedsportal.com/c/34255/f/623057/index.rss'),
|
|
||||||
(u'Real Estate', u'http://www.baltimoresun.com/classified/realestate/rss2.0.xml'),
|
(u'Real Estate', u'http://www.baltimoresun.com/classified/realestate/rss2.0.xml'),
|
||||||
(u'Jobs', u'http://baltimore.feedsportal.com/c/34255/f/623059/index.rss'),
|
|
||||||
# (u'DIY', u'http://baltimore.feedsportal.com/c/34255/f/623060/index.rss'),
|
|
||||||
# (u'Consumer Safety', u'http://baltimore.feedsportal.com/c/34255/f/623061/index.rss'),
|
|
||||||
(u'Investing', u'http://www.baltimoresun.com/business/money/rss2.0.xml'),
|
(u'Investing', u'http://www.baltimoresun.com/business/money/rss2.0.xml'),
|
||||||
|
|
||||||
# Opinion##
|
# Opinion##
|
||||||
(u'Sun Editorials', u'http://www.baltimoresun.com/news/opinion/editorial/rss2.0.xml'),
|
(u'Sun Editorials', u'http://www.baltimoresun.com/news/opinion/editorial/rss2.0.xml'),
|
||||||
(u'Op/Ed', u'http://www.baltimoresun.com/news/opinion/oped/rss2.0.xml'),
|
(u'Op/Ed', u'http://www.baltimoresun.com/news/opinion/oped/rss2.0.xml'),
|
||||||
(u'Readers Respond', u'http://baltimore.feedsportal.com/c/34255/f/623065/index.rss'),
|
|
||||||
|
|
||||||
# Columnists ##
|
|
||||||
(u'Kevin Cowherd', u'http://www.baltimoresun.com/sports/bal-columnist-cowherd,0,6829726.columnist-rss2.0.xml'),
|
|
||||||
(u'Robert Ehrlich', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-ehrlich,0,1825227.columnist-rss2.0.xml'),
|
|
||||||
(u'Jacques Kelly', u'http://www.baltimoresun.com/news/maryland/bal-columnist-kelly,0,1154701.columnist-rss2.0.xml'),
|
|
||||||
(u'Marta H. Mossburg', u'http://www.baltimoresun.com/news/opinion/oped/bal-columnist-mossburg,0,7982155.columnist-rss2.0.xml'),
|
|
||||||
(u'Mike Preston', u'http://www.baltimoresun.com/sports/bal-columnist-preston,0,6169796.columnist-rss2.0.xml'),
|
|
||||||
(u'Susan Reimer', u'http://www.baltimoresun.com/news/opinion/bal-columnist-reimer,0,162466.columnist-rss2.0.xml'),
|
|
||||||
(u'Dan Rodricks', u'http://www.baltimoresun.com/news/maryland/bal-columnist-rodricks,0,7089843.columnist-rss2.0.xml'),
|
|
||||||
(u'Thomas F. Schaller', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-schaller,0,897397.columnist-rss2.0.xml'),
|
|
||||||
(u'Peter Schmuck', u'http://www.baltimoresun.com/sports/bal-columnist-schmuck,0,7485088.columnist-rss2.0.xml'),
|
|
||||||
|
|
||||||
# News Blogs ##
|
# News Blogs ##
|
||||||
(u'Baltimore Crime Beat',
|
|
||||||
u'http://baltimore.feedsportal.com/c/34255/f/623075/index.rss'),
|
|
||||||
(u'InsideEd', u'http://www.baltimoresun.com/news/maryland/education/blog/rss2.0.xml'),
|
(u'InsideEd', u'http://www.baltimoresun.com/news/maryland/education/blog/rss2.0.xml'),
|
||||||
(u'Maryland Politics',
|
(u'Maryland Politics', u'http://www.baltimoresun.com/news/maryland/politics/blog/rss2.0.xml'),
|
||||||
u'http://www.baltimoresun.com/news/maryland/politics/blog/rss2.0.xml'),
|
(u'Maryland Weather', u'http://www.baltimoresun.com/news/weather/weather-blog/rss2.0.xml'),
|
||||||
(u'Maryland Weather',
|
(u'Second Opinion', u'http://www.baltimoresun.com/news/opinion/second-opinion-blog/rss2.0.xml'),
|
||||||
u'http://www.baltimoresun.com/news/weather/weather-blog/rss2.0.xml'),
|
(u'Sun Investigates', u'http://www.baltimoresun.com/news/maryland/sun-investigates/rss2.0.xml'),
|
||||||
(u'Second Opinion',
|
|
||||||
u'http://www.baltimoresun.com/news/opinion/second-opinion-blog/rss2.0.xml'),
|
|
||||||
(u'Sun Investigates',
|
|
||||||
u'http://www.baltimoresun.com/news/maryland/sun-investigates/rss2.0.xml'),
|
|
||||||
(u'You Dont Say', u'http://www.baltimoresun.com/news/language-blog/rss2.0.xml'),
|
(u'You Dont Say', u'http://www.baltimoresun.com/news/language-blog/rss2.0.xml'),
|
||||||
|
|
||||||
# Business Blogs ##
|
# Business Blogs ##
|
||||||
(u'BaltTech', u'http://www.baltimoresun.com/business/technology/blog/rss2.0.xml'),
|
(u'BaltTech', u'http://www.baltimoresun.com/business/technology/blog/rss2.0.xml'),
|
||||||
(u'Consuming Interests',
|
(u'Consuming Interests', u'http://www.baltimoresun.com/business/consuming-interests-blog/rss2.0.xml'),
|
||||||
u'http://www.baltimoresun.com/business/consuming-interests-blog/rss2.0.xml'),
|
(u'The Real Estate Wonk', u'http://www.baltimoresun.com/business/real-estate/wonk/rss2.0.xml'),
|
||||||
(u'The Real Estate Wonk',
|
|
||||||
u'http://www.baltimoresun.com/business/real-estate/wonk/rss2.0.xml'),
|
|
||||||
|
|
||||||
# Entertainment Blogs ##
|
# Entertainment Blogs ##
|
||||||
(u'ArtSmash', 'http://www.baltimoresun.com/entertainment/arts/artsmash/rss2.0.xml'),
|
(u'ArtSmash', 'http://www.baltimoresun.com/entertainment/arts/artsmash/rss2.0.xml'),
|
||||||
(u'Baltimore Diner', u'http://baltimore.feedsportal.com/c/34255/f/623088/index.rss'),
|
|
||||||
(u'Midnight Sun', u'http://www.baltimoresun.com/entertainment/music/midnight-sun-blog/rss2.0.xml'),
|
(u'Midnight Sun', u'http://www.baltimoresun.com/entertainment/music/midnight-sun-blog/rss2.0.xml'),
|
||||||
(u'Read Street', u'http://www.baltimoresun.com/features/books/read-street/rss2.0.xml'),
|
(u'Read Street', u'http://www.baltimoresun.com/features/books/read-street/rss2.0.xml'),
|
||||||
(u'Z on TV', u'http://www.baltimoresun.com/entertainment/tv/z-on-tv-blog/rss2.0.xml'),
|
(u'Z on TV', u'http://www.baltimoresun.com/entertainment/tv/z-on-tv-blog/rss2.0.xml'),
|
||||||
|
|
||||||
# Life Blogs ##
|
# Life Blogs ##
|
||||||
# (u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'),
|
(u'Baltimore Insider', u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'),
|
||||||
(u'Baltimore Insider',
|
|
||||||
u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'),
|
|
||||||
(u'Picture of Health', u'http://www.baltimoresun.com/health/blog/rss2.0.xml'),
|
(u'Picture of Health', u'http://www.baltimoresun.com/health/blog/rss2.0.xml'),
|
||||||
# (u'Unleashed', u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'),
|
|
||||||
|
|
||||||
# b the site blogs ##
|
|
||||||
(u'TV Lust', u'http://baltimore.feedsportal.com/c/34255/f/623096/index.rss'),
|
|
||||||
|
|
||||||
# Sports Blogs ##
|
# Sports Blogs ##
|
||||||
(u'Baltimore Sports Blitz',
|
|
||||||
u'http://baltimore.feedsportal.com/c/34255/f/623097/index.rss'),
|
|
||||||
# (u'Lacrosse Insider',u'http://www.baltimoresun.com/sports/lacrosse-blog/rss2.0.xml'),
|
|
||||||
(u'Orioles Insider', u'http://baltimore.feedsportal.com/c/34255/f/623100/index.rss'),
|
(u'Orioles Insider', u'http://baltimore.feedsportal.com/c/34255/f/623100/index.rss'),
|
||||||
(u'Ravens Insider',
|
(u'Ravens Insider', u'http://www.baltimoresun.com/sports/ravens/ravens-insider/rss2.0.xml'),
|
||||||
u'http://www.baltimoresun.com/sports/ravens/ravens-insider/rss2.0.xml'),
|
(u'The Schmuck Stops Here', u'http://www.baltimoresun.com/sports/schmuck-blog/rss2.0.xml'),
|
||||||
# (u'Ring Posts', u'http://weblogs.baltimoresun.com/sports/wrestling/blog/index.xml'),
|
|
||||||
(u'The Schmuck Stops Here',
|
|
||||||
u'http://www.baltimoresun.com/sports/schmuck-blog/rss2.0.xml'),
|
|
||||||
# (u'Tracking the Terps', u'http://weblogs.baltimoresun.com/sports/college/maryland_terps/blog/index.xml'),
|
|
||||||
# (u'Varsity Letters', u'http://weblogs.baltimoresun.com/sports/highschool/varsityletters/index.xml'),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
def get_article_url(self, article):
|
|
||||||
ans = None
|
|
||||||
try:
|
|
||||||
s = article.summary
|
|
||||||
ans = urllib.unquote(
|
|
||||||
re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
if ans is None:
|
|
||||||
ans = article.get('feedburner_origlink',
|
|
||||||
article.get('guid', article.get('link')))
|
|
||||||
if ans is not None:
|
|
||||||
return ans.replace('?track=rss', '')
|
|
||||||
|
|
||||||
def skip_ad_pages(self, soup):
|
|
||||||
text = soup.find(text='click here to continue to article')
|
|
||||||
if text:
|
|
||||||
a = text.parent
|
|
||||||
url = a.get('href')
|
|
||||||
if url:
|
|
||||||
return self.index_to_soup(url, raw=True)
|
|
||||||
|
@ -12,20 +12,24 @@ class DallasNews(BasicNewsRecipe):
|
|||||||
auto_cleanup = True
|
auto_cleanup = True
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
|
('News',
|
||||||
|
'http://www.dallasnews.com/news.rss'),
|
||||||
('Local News',
|
('Local News',
|
||||||
'http://www.dallasnews.com/news/politics/local-politics/?rss'),
|
'http://www.dallasnews.com/news/local-politics.rss'),
|
||||||
('National Politics',
|
|
||||||
'http://www.dallasnews.com/news/politics/national-politic/?rss'),
|
|
||||||
('State Politics',
|
('State Politics',
|
||||||
'http://www.dallasnews.com/news/politics/state-politics/?rss'),
|
'http://www.dallasnews.com/news/texas-politics.rss'),
|
||||||
('Religion',
|
('Religion',
|
||||||
'http://www.dallasnews.com/news/religion/?rss'),
|
'http://www.dallasnews.com/life/faith.rss'),
|
||||||
('Crime',
|
('Crime',
|
||||||
'http://www.dallasnews.com/news/crime/headlines/?rss'),
|
'http://www.dallasnews.com/news/crime.rss'),
|
||||||
('Celebrity News',
|
('Celebrity News',
|
||||||
'http://www.dallasnews.com/entertainment/celebrity-news/?rss&listname=TopStories'),
|
'http://www.dallasnews.com/entertainment/celebrity-news/?rss&listname=TopStories'),
|
||||||
('Nation',
|
('Business',
|
||||||
'http://www.dallasnews.com/news/nation-world/nation/?rss'),
|
'http://www.dallasnews.com/business.rss'),
|
||||||
('World',
|
('Arts',
|
||||||
'http://www.dallasnews.com/news/nation-world/world/?rss'),
|
'http://www.dallasnews.com/arts.rss'),
|
||||||
|
('Life',
|
||||||
|
'http://www.dallasnews.com/life.rss'),
|
||||||
|
('Opinion',
|
||||||
|
'http://www.dallasnews.com/opinion.rss'),
|
||||||
]
|
]
|
||||||
|
@ -18,12 +18,15 @@ articles_are_obfuscated = True
|
|||||||
|
|
||||||
class digiArts(BasicNewsRecipe):
|
class digiArts(BasicNewsRecipe):
|
||||||
__author__ = 'Lorenzo Vigentini'
|
__author__ = 'Lorenzo Vigentini'
|
||||||
description = 'Digital Arts - comprehensive coverage of the art of graphic design, 3D, animation, video, effects, web and interactive design, in print and online.' # noqa
|
description = ('Digital Arts - comprehensive coverage of the art of '
|
||||||
|
'graphic design, 3D, animation, video, effects, web and '
|
||||||
|
'interactive design, in print and online.') # noqa
|
||||||
cover_url = 'http://media.digitalartsonline.co.uk/graphics/logo_digital_arts.gif'
|
cover_url = 'http://media.digitalartsonline.co.uk/graphics/logo_digital_arts.gif'
|
||||||
|
|
||||||
title = 'Digital Arts Magazine '
|
title = 'Digital Arts Magazine '
|
||||||
publisher = 'IDG Communication'
|
publisher = 'IDG Communication'
|
||||||
category = 'Multimedia, photo, video, computing, product reviews, editing, cameras, production'
|
category = ('Multimedia, photo, video, computing, product reviews, '
|
||||||
|
'editing, cameras, production')
|
||||||
|
|
||||||
language = 'en'
|
language = 'en'
|
||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
@ -36,30 +39,22 @@ class digiArts(BasicNewsRecipe):
|
|||||||
|
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
auto_cleanup = False
|
||||||
def get_obfuscated_article(self, url):
|
|
||||||
br = self.get_browser()
|
|
||||||
br.open(url + '&print')
|
|
||||||
|
|
||||||
response = br.follow_link(url, nr=0)
|
|
||||||
html = response.read()
|
|
||||||
|
|
||||||
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
|
|
||||||
self.temp_files[-1].write(html)
|
|
||||||
self.temp_files[-1].close()
|
|
||||||
return self.temp_files[-1].name
|
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(name='div', attrs={'id': ['articleHeader', 'articleContent']})
|
dict(name='h1', attrs={'itemprop': 'headline'}),
|
||||||
|
dict(name='span', attrs={'itemprop': 'author'}),
|
||||||
|
dict(name='section', attrs={'class': 'articleBody'}),
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_tags = [
|
# Feeds are found here: http://www.digitalartsonline.co.uk/rss/
|
||||||
dict(name='div', attrs={'class': ['submissionBar', 'mpuContainer']}),
|
|
||||||
dict(name='div', attrs={'id': ['articleSidebar', 'articleFooter']})
|
|
||||||
]
|
|
||||||
remove_tags_after = [
|
|
||||||
dict(name='p', attrs={'id': 'articlePageList'})
|
|
||||||
]
|
|
||||||
feeds = [
|
feeds = [
|
||||||
(u'Content', u'http://rss.feedsportal.com/c/662/f/8410/index.rss')
|
('Latest News Articles',
|
||||||
|
'http://www.digitalartsonline.co.uk/rss/feeds/digitalarts-news.xml'),
|
||||||
|
('Latest Tutorials',
|
||||||
|
'http://www.digitalartsonline.co.uk/rss/feeds/digitalarts-tutorials.xml'),
|
||||||
|
('Latest Reviews',
|
||||||
|
'http://www.digitalartsonline.co.uk/rss/feeds/digitalarts-reviews.xml'),
|
||||||
|
('Latest Features',
|
||||||
|
'http://www.digitalartsonline.co.uk/rss/feeds/digitalarts-features.xml'),
|
||||||
]
|
]
|
||||||
|
@ -112,23 +112,13 @@ class DiscoverMagazine(BasicNewsRecipe):
|
|||||||
return soup
|
return soup
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
(u'Technology', u'http://discovermagazine.com/topics/technology/rss.xml'),
|
(u'Technologiy', u'http://feeds.feedburner.com/DiscoverTechnology'),
|
||||||
(u'Health - Medicine',
|
(u'Health & Medicine', u'http://feeds.feedburner.com/DiscoverHealthMedicine'),
|
||||||
u'http://discovermagazine.com/topics/health-medicine/rss.xml'),
|
(u'Mind Brain', u'http://feeds.feedburner.com/DiscoverMindBrain'),
|
||||||
(u'Mind Brain', u'http://discovermagazine.com/topics/mind-brain/rss.xml'),
|
(u'Space & Physics', u'http://feeds.feedburner.com/DiscoverSpace'),
|
||||||
(u'Space', u'http://discovermagazine.com/topics/space/rss.xml'),
|
(u'Living World', u'http://feeds.feedburner.com/DiscoverLivingWorld'),
|
||||||
(u'Human Origins', u'http://discovermagazine.com/topics/human-origins/rss.xml'),
|
(u'Environment', u'http://feeds.feedburner.com/DiscoverEnvironment'),
|
||||||
(u'Living World', u'http://discovermagazine.com/topics/living-world/rss.xml'),
|
|
||||||
(u'Environment', u'http://discovermagazine.com/topics/environment/rss.xml'),
|
|
||||||
(u'Physics & Math', u'http://discovermagazine.com/topics/physics-math/rss.xml'),
|
|
||||||
(u"20 Things you didn't know about...",
|
(u"20 Things you didn't know about...",
|
||||||
u'http://discovermagazine.com/columns/20-things-you-didnt-know/rss.xml'),
|
u'http://feeds.feedburner.com/20ThingsYouDidntKnowAbout'),
|
||||||
(u'Fuzzy Math', u'http://discovermagazine.com/columns/fuzzy-math/rss.xml'),
|
(u'Vital Signs', u'http://feeds.feedburner.com/discovermagazine/VitalSigns'),
|
||||||
(u'The Brain', u'http://discovermagazine.com/columns/the-brain/rss.xml'),
|
|
||||||
(u'What is This', u'http://discovermagazine.com/columns/what-is-this/rss.xml'),
|
|
||||||
(u'Vital Signs', u'http://discovermagazine.com/columns/vital-signs/rss.xml'),
|
|
||||||
(u'Think Tech', u'http://discovermagazine.com/columns/think-tech/rss.xml'),
|
|
||||||
(u'Future Tech', u'http://discovermagazine.com/columns/future-tech/rss.xml'),
|
|
||||||
(u'Discover Interview',
|
|
||||||
u'http://discovermagazine.com/columns/discover-interview/rss.xml'),
|
|
||||||
]
|
]
|
||||||
|
@ -18,9 +18,12 @@ class EandP(BasicNewsRecipe):
|
|||||||
encoding = 'utf8'
|
encoding = 'utf8'
|
||||||
cover_url = 'http://www.editorandpublisher.com/images/EP_main_logo.gif'
|
cover_url = 'http://www.editorandpublisher.com/images/EP_main_logo.gif'
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
|
auto_cleanup = True
|
||||||
|
|
||||||
html2lrf_options = [
|
html2lrf_options = [
|
||||||
'--comment', description, '--category', category, '--publisher', publisher
|
'--comment', description,
|
||||||
|
'--category', category,
|
||||||
|
'--publisher', publisher
|
||||||
]
|
]
|
||||||
|
|
||||||
html2epub_options = 'publisher="' + publisher + \
|
html2epub_options = 'publisher="' + publisher + \
|
||||||
@ -34,21 +37,11 @@ class EandP(BasicNewsRecipe):
|
|||||||
h2{font-size: large;}
|
h2{font-size: large;}
|
||||||
'''
|
'''
|
||||||
|
|
||||||
# Keep only div:itemmgap
|
|
||||||
|
|
||||||
keep_only_tags = [
|
|
||||||
dict(name='div', attrs={'class': 'itemmgap'})
|
|
||||||
]
|
|
||||||
|
|
||||||
# Remove commenting/social media links
|
# Remove commenting/social media links
|
||||||
|
|
||||||
remove_tags_after = [dict(name='div', attrs={'class': 'clear'})]
|
remove_tags_after = [dict(name='div', attrs={'class': 'clear'})]
|
||||||
|
|
||||||
feeds = [(u'Breaking News', u'http://www.editorandpublisher.com/GenerateRssFeed.aspx'),
|
feeds = [
|
||||||
(u'Business News',
|
(u'Editor & Publisher', u'http://www.editorandpublisher.com/feed/'),
|
||||||
u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=2'),
|
(u'Comments', u'http://www.editorandpublisher.com/comments/feed/'),
|
||||||
(u'Ad/Circ News', u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=3'),
|
]
|
||||||
(u'Newsroom', u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=4'),
|
|
||||||
(u'Technology News',
|
|
||||||
u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=5'),
|
|
||||||
(u'Syndicates News', u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=7')]
|
|
||||||
|
@ -5,32 +5,9 @@ class AdvancedUserRecipe1295088390(BasicNewsRecipe):
|
|||||||
title = u'Everett Herald'
|
title = u'Everett Herald'
|
||||||
language = 'en'
|
language = 'en'
|
||||||
__author__ = '77ja65'
|
__author__ = '77ja65'
|
||||||
oldest_article = 4
|
oldest_article = 7
|
||||||
max_articles_per_feed = 50
|
max_articles_per_feed = 50
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
masthead_url = 'http://heraldnet.com/images/hnet/jQueryComponents/jQueryNavigation/heraldnet_logo.png'
|
auto_cleanup = True
|
||||||
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
|
|
||||||
|
|
||||||
feeds = [(u'Local News',
|
feeds = [(u'Local News', u'http://www.heraldnet.com/feed/')]
|
||||||
u'http://heraldnet.com/section/RSS02&mime=xml'),
|
|
||||||
(u'Sports', u'http://heraldnet.com/section/RSS04&mime=xml'),
|
|
||||||
(u'Entertainment',
|
|
||||||
u'http://heraldnet.com/section/RSS07&mime=xml'),
|
|
||||||
(u'Life', u'http://heraldnet.com/section/RSS03&mime=xml'),
|
|
||||||
(u'Breaking News',
|
|
||||||
u'http://heraldnet.com/section/RSS34&mime=xml'),
|
|
||||||
(u'Seahawks', u'http://heraldnet.com/section/RSS22&mime=xml'),
|
|
||||||
(u'HeraldNet', u'http://heraldnet.com/section/RSS01&mime=xml'),
|
|
||||||
(u'Inside Everett',
|
|
||||||
u'http://heraldnet.com/section/RSS26&mime=xml')
|
|
||||||
]
|
|
||||||
|
|
||||||
def print_version(self, url):
|
|
||||||
return url + "&template=PrinterFriendly"
|
|
||||||
|
|
||||||
extra_css = '''
|
|
||||||
h1{font-family:Arial,Helvetica,sans-serif; font-
|
|
||||||
weight:bold;font-size:large;}
|
|
||||||
h2{font-family:Arial,Helvetica,sans-serif; font-
|
|
||||||
weight:normal;font-size:small;}
|
|
||||||
'''
|
|
||||||
|
@ -6,101 +6,29 @@ class FairbanksDailyNewsminer(BasicNewsRecipe):
|
|||||||
__author__ = 'Roger'
|
__author__ = 'Roger'
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
|
|
||||||
description = 'The voice of interior Alaska since 1903'
|
description = 'The voice of interior Alaska since 1903'
|
||||||
publisher = 'http://www.newsminer.com/'
|
publisher = 'http://www.newsminer.com/'
|
||||||
category = 'news, Alaska, Fairbanks'
|
category = 'news, Alaska, Fairbanks'
|
||||||
language = 'en'
|
language = 'en'
|
||||||
|
|
||||||
# Make article titles, author and date bold, italic or small font.
|
|
||||||
# http://assets.matchbin.com/sites/635/stylesheets/newsminer.com.css
|
|
||||||
# (signature_line contains date, views, comments)
|
|
||||||
extra_css = '''
|
|
||||||
.story_item_headline { font-size: medium; font-weight: bold; }
|
|
||||||
.story_item_author { font-size: small; font-style:italic; }
|
|
||||||
.signature_line { font-size: small; }
|
|
||||||
'''
|
|
||||||
|
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
language = 'en'
|
language = 'en'
|
||||||
encoding = 'utf8'
|
encoding = 'utf8'
|
||||||
conversion_options = {'linearize_tables': True}
|
conversion_options = {'linearize_tables': True}
|
||||||
|
auto_cleanup = True
|
||||||
|
|
||||||
# TODO: The News-miner cover image seems a bit small. Can this be
|
|
||||||
# enlarged by 10-30%?
|
|
||||||
masthead_url = 'http://d2uh5w9wm14i0w.cloudfront.net/sites/635/assets/top_masthead_-_menu_pic.jpg'
|
|
||||||
|
|
||||||
# In order to omit seeing number of views, number of posts and the pipe
|
|
||||||
# symbol for divider after the title and date of the article, a regex or
|
|
||||||
# manual processing is needed to get just the "story_item_date updated"
|
|
||||||
# (which contains the date). Everything else on this line is pretty much not needed.
|
|
||||||
#
|
|
||||||
# Currently, you will see the following:
|
|
||||||
# | Aug 24, 2011 | 654 views | 6 | |
|
|
||||||
# (ie. 6 comments)
|
|
||||||
#
|
|
||||||
|
|
||||||
# The following was suggested, but it looks like I also need to define self & soup
|
|
||||||
# (as well as bring in extra soup depends?)
|
|
||||||
# date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'}))
|
|
||||||
|
|
||||||
# preprocess_regexps = [(re.compile(r'<span[^>]*addthis_separator*>'), lambda match: '') ]
|
|
||||||
# preprocess_regexps = [(re.compile(r'span class="addthis_separator">|</span>'), lambda match: '') ]
|
|
||||||
|
|
||||||
# preprocess_regexps = [
|
|
||||||
# (re.compile(r'<start>.*?<end>', re.IGNORECASE | re.DOTALL), lambda match : ''),
|
|
||||||
# ]
|
|
||||||
|
|
||||||
# def get_browser(self):
|
|
||||||
# def preprocess_html(soup, first_fetch):
|
|
||||||
# date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'}))
|
|
||||||
# return
|
|
||||||
|
|
||||||
# preprocess_regexps = [(re.compile(r' |.*?', re.DOTALL), lambda m: '')]
|
|
||||||
|
|
||||||
keep_only_tags = [
|
|
||||||
dict(name='div', attrs={'class': 'story_item_headline entry-title'}),
|
|
||||||
dict(name='div', attrs={'class': 'full_story'})
|
|
||||||
]
|
|
||||||
|
|
||||||
remove_tags = [
|
|
||||||
# Try getting rid of some signature_line (date line) stuff
|
|
||||||
dict(name='img', attrs={'class': 'dont_touch_me'}),
|
|
||||||
dict(name='span', attrs={
|
|
||||||
'class': 'number_recommendations'}),
|
|
||||||
|
|
||||||
# Removes div within <!-- AddThis Button BEGIN --> <!--
|
|
||||||
# AddThis Button END -->
|
|
||||||
dict(name='div', attrs={
|
|
||||||
'class': 'addthis_toolbox addthis_default_style'}),
|
|
||||||
|
|
||||||
dict(name='div', attrs={'class': 'related_content'}),
|
|
||||||
dict(name='div', attrs={'id': 'comments_container'})
|
|
||||||
]
|
|
||||||
|
|
||||||
# Comment-out or uncomment any of the following RSS feeds according to your
|
|
||||||
# liking.
|
|
||||||
#
|
|
||||||
# TODO: Some random bits of text might be trailing the last page (or TOC on
|
|
||||||
# MOBI files), these are bits of public posts and comments and need to also
|
|
||||||
# be removed.
|
|
||||||
#
|
|
||||||
feeds = [
|
feeds = [
|
||||||
(u'Alaska News', u'http://newsminer.com/rss/rss_feeds/alaska_news?content_type=article&tags=alaska_news&page_name=rss_feeds&instance=alaska_news'),
|
(u'Alaska News',
|
||||||
(u'Local News', u'http://newsminer.com/rss/rss_feeds/local_news?content_type=article&tags=local_news&page_name=rss_feeds&offset=0&instance=local_news'),
|
u'http://www.newsminer.com/search/?f=rss&t=article&c=news/alaska_news&l=50&s=start_time&sd=desc'),
|
||||||
(u'Business', u'http://newsminer.com/rss/rss_feeds/business_news?content_type=article&tags=business_news&page_name=rss_feeds&instance=business_news'),
|
(u'Local News',
|
||||||
(u'Politics', u'http://newsminer.com/rss/rss_feeds/politics_news?content_type=article&tags=politics_news&page_name=rss_feeds&instance=politics_news'),
|
u'http://www.newsminer.com/search/?f=rss&t=article&c=news/local_news&l=50&s=start_time&sd=desc'),
|
||||||
(u'Sports', u'http://newsminer.com/rss/rss_feeds/sports_news?content_type=article&tags=sports_news&page_name=rss_feeds&instance=sports_news'),
|
(u'Business',
|
||||||
(u'Latitude 65 feed', u'http://newsminer.com/rss/rss_feeds/latitude_65?content_type=article&tags=latitude_65&page_name=rss_feeds&offset=0&instance=latitude_65'), # noqa
|
u'http://www.newsminer.com/search/?f=rss&t=article&c=business&l=50&s=start_time&sd=desc'),
|
||||||
# (u'Sundays', u'http://newsminer.com/rss/rss_feeds/Sundays?content_type=article&tags=alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Sundays'), # noqa
|
(u'Politics',
|
||||||
(u'Outdoors', u'http://newsminer.com/rss/rss_feeds/Outdoors?content_type=article&tags=outdoors&page_name=rss_feeds&instance=Outdoors'),
|
u'http://www.newsminer.com/search/?f=rss&t=article&c=news/politics&l=50&s=start_time&sd=desc'),
|
||||||
# (u'Fairbanks Grizzlies', u'http://newsminer.com/rss/rss_feeds/fairbanks_grizzlies?content_type=article&tags=fairbanks_grizzlies&page_name=rss_feeds&instance=fairbanks_grizzlies'), # noqa
|
(u'Sports',
|
||||||
# (u'Newsminer', u'http://newsminer.com/rss/rss_feeds/Newsminer?content_type=article&tags=ted_stevens_bullets+ted_stevens+sports_news+business_news+fairbanks_grizzlies+dermot_cole_column+outdoors+alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Newsminer'), # noqa
|
u'http://www.newsminer.com/search/?f=rss&t=article&c=sports&l=50&s=start_time&sd=desc'),
|
||||||
(u'Opinion', u'http://newsminer.com/rss/rss_feeds/Opinion?content_type=article&tags=editorials&page_name=rss_feeds&instance=Opinion'),
|
(u'Opinion',
|
||||||
(u'Youth', u'http://newsminer.com/rss/rss_feeds/Youth?content_type=article&tags=youth&page_name=rss_feeds&instance=Youth'),
|
u'http://www.newsminer.com/search/?f=rss&t=article&c=opinion&l=50&s=start_time&sd=desc'),
|
||||||
# (u'Dermot Cole Blog', u'http://newsminer.com/rss/rss_feeds/dermot_cole_blog+rss?content_type=blog+entry&sort_by=posted_on&user_ids=3015275&page_name=blogs_dermot_cole&limit=10&instance=dermot_cole_blog+rss'), # noqa
|
|
||||||
(u'Dermot Cole Column', u'http://newsminer.com/rss/rss_feeds/Dermot_Cole_column?content_type=article&tags=dermot_cole_column&page_name=rss_feeds&instance=Dermot_Cole_column'), # noqa
|
|
||||||
# (u'Sarah Palin', u'http://newsminer.com/rss/rss_feeds/sarah_palin?content_type=article&tags=palin_in_the_news+palin_on_the_issues&page_name=rss_feeds&tag_inclusion=or&instance=sarah_palin') # noqa
|
|
||||||
]
|
]
|
||||||
|
@ -20,6 +20,7 @@ class FanGraphs(BasicNewsRecipe):
|
|||||||
category = 'Baseball'
|
category = 'Baseball'
|
||||||
language = 'en'
|
language = 'en'
|
||||||
publication_type = 'Blog'
|
publication_type = 'Blog'
|
||||||
|
auto_cleanup = True
|
||||||
|
|
||||||
description = 'Baseball statistical analysis, graphs, and projections.'
|
description = 'Baseball statistical analysis, graphs, and projections.'
|
||||||
__author__ = 'David Appelman'
|
__author__ = 'David Appelman'
|
||||||
@ -27,9 +28,8 @@ class FanGraphs(BasicNewsRecipe):
|
|||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
(u'Fangraphs', u'http://feeds.feedburner.com/FanGraphs?format=xml'),
|
(u'Fangraphs', u'http://feeds.feedburner.com/FanGraphs?format=xml'),
|
||||||
(u'Rotographs', u'http://www.wizardrss.com/feed/feeds.feedburner.com/RotoGraphs?format=xml'),
|
(u'Rotographs', u'http://feeds.feedburner.com/RotoGraphs?format=xml'),
|
||||||
(u'Community', u'http://www.wizardrss.com/feed/www.fangraphs.com/community/?feed=rss2'),
|
(u'NotGraphs', u'http://feeds.feedburner.com/NotGraphs?format=xml')]
|
||||||
(u'NotGraphs', u'http://www.wizardrss.com/feed/www.fangraphs.com/not/?feed=rss2')]
|
|
||||||
|
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
|
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
|
||||||
|
@ -10,37 +10,27 @@ class AdvancedUserRecipe1305547242(BasicNewsRecipe):
|
|||||||
language = 'en'
|
language = 'en'
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
__author__ = 'Anonymous'
|
__author__ = 'Anonymous'
|
||||||
remove_tags = [
|
auto_cleanup = True
|
||||||
dict(name='div', attrs={'class': 'articles_footer', 'class': 'printoptions'})]
|
|
||||||
|
|
||||||
def print_version(self, url):
|
feeds = [
|
||||||
return url + '?printable=true'
|
(u'All Fashion',
|
||||||
|
u'http://feeds.glamour.com/glamour/all_fashion'),
|
||||||
def preprocess_html(self, soup):
|
(u'All Beauty',
|
||||||
for alink in soup.findAll('a'):
|
u'http://feeds.glamour.com/glamour/all_beauty'),
|
||||||
if alink.string is not None:
|
|
||||||
tstr = alink.string
|
|
||||||
alink.replaceWith(tstr)
|
|
||||||
return soup
|
|
||||||
|
|
||||||
feeds = [ (u'All Fashion', u'http://feeds.glamour.com/glamour/all_fashion'),
|
|
||||||
(u'All Beauty', u'http://feeds.glamour.com/glamour/all_beauty'),
|
|
||||||
(u'All Sex, Love & Life',
|
(u'All Sex, Love & Life',
|
||||||
u'http://feeds.glamour.com/glamour/sex_love_life'),
|
u'http://feeds.glamour.com/glamour/sex_love_life'),
|
||||||
(u'All Health & Fitness',
|
(u'All Health & Fitness',
|
||||||
u'http://feeds.glamour.com/glamour/health_fitness'),
|
u'http://feeds.glamour.com/glamour/health_fitness'),
|
||||||
(u'Shopping', u'http://feeds.glamour.com/glamour/shopping'),
|
|
||||||
(u'Slaves to Fashion blog',
|
(u'Slaves to Fashion blog',
|
||||||
u'http://feeds.glamour.com/glamour/slavestofashion'),
|
u'http://feeds.glamour.com/glamour/slavestofashion'),
|
||||||
(u'The Girls in the Beauty Department',
|
(u'The Girls in the Beauty Department',
|
||||||
u'http://feeds.glamour.com/glamour/thegirlsinthebeautydepartment'),
|
u'http://feeds.glamour.com/glamour/thegirlsinthebeautydepartment'),
|
||||||
(u'Smitten blog', u'http://feeds.glamour.com/glamour/smitten'),
|
(u'Smitten blog',
|
||||||
(u'Save the Date', u'http://feeds.feedburner.com/glamour/save-the-date'),
|
u'http://feeds.glamour.com/glamour/smitten'),
|
||||||
(u'Single-ish blog', u'http://feeds.glamour.com/glamour/glamoursingle-ish'),
|
(u'Save the Date',
|
||||||
(u'Save the Date', u'http://feeds.feedburner.com/glamour/save-the-date'),
|
u'http://feeds.feedburner.com/glamour/save-the-date'),
|
||||||
(u'Vitamin G blog', u'http://feeds.glamour.com/glamour/vitamin-g'),
|
(u'Save the Date',
|
||||||
(u'Margarita Shapes Up blog',
|
u'http://feeds.feedburner.com/glamour/save-the-date'),
|
||||||
u'http://feeds.glamour.com/glamour/margaritashapesup'),
|
(u'Vitamin G blog',
|
||||||
(u'Little Miss Fortune blog',
|
u'http://feeds.glamour.com/glamour/vitamin-g'),
|
||||||
u'http://feeds.glamour.com/glamour/little-miss-fortune'),
|
|
||||||
]
|
]
|
||||||
|
@ -19,29 +19,19 @@ class NewsandRecord(BasicNewsRecipe):
|
|||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
auto_cleanup = True
|
||||||
|
|
||||||
conversion_options = {
|
conversion_options = {
|
||||||
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
|
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
|
||||||
}
|
}
|
||||||
|
|
||||||
remove_tags_before = dict(name='h3', attrs={'class': 'nrcTxt_headline'})
|
|
||||||
remove_tags_after = dict(name='div', attrs={'id': 'nrcBlk_ContentBody'})
|
|
||||||
|
|
||||||
remove_tags = [
|
|
||||||
dict(name='iframe'),
|
|
||||||
dict(name=['notags', 'embed', 'object', 'link', 'img']),
|
|
||||||
|
|
||||||
]
|
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
('News', 'http://www.news-record.com/news/archive/feed'),
|
('News', 'http://www.greensboro.com/search/?q=&t=article&l=10&d=&d1=&d2=&s=start_time&sd=desc&c[]=news,news/*&f=rss'),
|
||||||
('Greensboro News', 'http://www.news-record.com/news/greensboro/feed'),
|
('Greensboro News', 'http://www.greensboro.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=news/local,news/crime,news/goverment,news/schools,news/rockingham_county,news/local,news/crime,news/goverment,news/schools,news/rockingham_county/*&f=rss'),
|
||||||
('Education', 'http://www.news-record.com/news/education/feed'),
|
('Business', 'http://www.greensboro.com/search/?q=&t=article&l=10&d=&d1=&d2=&s=start_time&sd=desc&c[]=business,business/*&f=rss'),
|
||||||
('Government', 'http://www.news-record.com/news/government/feed'),
|
('Local Business', 'http://www.greensboro.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=business/local_business,business/local_business/*&f=rss'),
|
||||||
('College Sports', 'http://www.news-record.com/sports/college/feed'),
|
('Sports', 'http://www.greensboro.com/search/?q=&t=article&l=10&d=&d1=&d2=&s=start_time&sd=desc&c[]=sports,sports/*&f=rss'),
|
||||||
('Sports Extra', 'http://www.news-record.com/blog/sportsextra/feed'),
|
('College Sports', 'http://www.greensboro.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=sports/college,sports/college/*&f=rss'),
|
||||||
('Life', 'http://www.news-record.com/life/top/feed'),
|
('Sports Extra', 'http://www.greensboro.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=blogs/sports_extra,blogs/sports_extra/*&f=rss'),
|
||||||
('NASCAR', 'http://www.news-record.com/sports/nascar/top/feed'),
|
('Life', 'http://www.greensboro.com/search/?q=&t=article&l=10&d=&d1=&d2=&s=start_time&sd=desc&c[]=life,life/*&f=rss'),
|
||||||
('Editorials', 'http://www.news-record.com/opinion/editorials/feed'),
|
|
||||||
('Letters to the Editor', 'http://www.news-record.com/opinion/letters/feed')
|
|
||||||
]
|
]
|
||||||
|
@ -12,36 +12,11 @@ class ChicagoTribune(BasicNewsRecipe):
|
|||||||
__author__ = 'Being and Sujata Raman'
|
__author__ = 'Being and Sujata Raman'
|
||||||
description = 'Politics, local and business news from Hartford'
|
description = 'Politics, local and business news from Hartford'
|
||||||
language = 'en'
|
language = 'en'
|
||||||
|
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
|
auto_cleanup = True
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'class': ["story", "entry-asset asset hentry"]}),
|
|
||||||
dict(name='div', attrs={
|
|
||||||
'id': ["pagebody", "story", "maincontentcontainer"]}),
|
|
||||||
]
|
|
||||||
remove_tags_after = [{'class': ['photo_article', ]}]
|
|
||||||
|
|
||||||
remove_tags = [
|
|
||||||
{'id': ["moduleArticleTools", "content-bottom", "rail", "articleRelates module", "toolSet", "relatedrailcontent", "div-wrapper", "beta", "atp-comments", "footer"]}, # noqa
|
|
||||||
{'class': ["clearfix", "relatedTitle", "articleRelates module", "asset-footer", "tools", "comments",
|
|
||||||
"featurePromo", "featurePromo fp-topjobs brownBackground", "clearfix fullSpan brownBackground", "curvedContent"]},
|
|
||||||
dict(name='font', attrs={'id': ["cr-other-headlines"]})]
|
|
||||||
extra_css = '''
|
|
||||||
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
|
|
||||||
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
|
|
||||||
.byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
|
|
||||||
.date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
|
|
||||||
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
|
|
||||||
.copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
|
|
||||||
.story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
|
|
||||||
.entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
|
|
||||||
.pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
|
|
||||||
.maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
|
|
||||||
.story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
|
|
||||||
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
|
|
||||||
'''
|
|
||||||
feeds = [
|
feeds = [
|
||||||
('Breaking News', 'http://feeds.feedburner.com/courant-breaking-news/'),
|
('Breaking News', 'http://feeds.feedburner.com/courant-breaking-news/'),
|
||||||
('Nation/World News', 'http://feeds.feedburner.com/courant-nation-world/'),
|
('Nation/World News', 'http://feeds.feedburner.com/courant-nation-world/'),
|
||||||
@ -64,29 +39,9 @@ class ChicagoTribune(BasicNewsRecipe):
|
|||||||
('Music', 'http://feeds.feedburner.com/courant-music/'),
|
('Music', 'http://feeds.feedburner.com/courant-music/'),
|
||||||
('TV', 'http://feeds.feedburner.com/courant-tv/'),
|
('TV', 'http://feeds.feedburner.com/courant-tv/'),
|
||||||
('Movies', 'http://feeds.feedburner.com/courant-movies/'),
|
('Movies', 'http://feeds.feedburner.com/courant-movies/'),
|
||||||
# ('Metromix headlines', 'http://feeds.feedburner.com/metromix/topheadlines/'),
|
|
||||||
# ('Metromix events', 'http://feeds.feedburner.com/metromix/events/'),
|
|
||||||
# ('Metromix restaurants', 'http://feeds.feedburner.com/metromix/restaurants/'),
|
|
||||||
('Outdoors', 'http://feeds.feedburner.com/courant-outdoors/'),
|
('Outdoors', 'http://feeds.feedburner.com/courant-outdoors/'),
|
||||||
('Peter Marteka', 'http://feeds.feedburner.com/courant-marteka-column/'),
|
('Peter Marteka', 'http://feeds.feedburner.com/courant-marteka-column/'),
|
||||||
('Susan Campbell', 'http://feeds.feedburner.com/courant-campbell-column/'),
|
|
||||||
('Helen Ubinas', 'http://feeds.feedburner.com/courant-helen-ubinas-column/'),
|
|
||||||
('Jim Shea', 'http://feeds.feedburner.com/courant-jim-shea-column/'),
|
('Jim Shea', 'http://feeds.feedburner.com/courant-jim-shea-column/'),
|
||||||
('Tom Condon', 'http://feeds.feedburner.com/courant-tom-condon-column/'),
|
('Tom Condon', 'http://feeds.feedburner.com/courant-tom-condon-column/'),
|
||||||
('Colin McEnroe', 'http://feeds.feedburner.com/courant-colin-mcenroe-column/'),
|
('Colin McEnroe', 'http://feeds.feedburner.com/courant-colin-mcenroe-column/'),
|
||||||
]
|
]
|
||||||
|
|
||||||
def get_article_url(self, article):
|
|
||||||
print article.get('feedburner_origlink', article.get('guid', article.get('link')))
|
|
||||||
return article.get('feedburner_origlink', article.get('guid', article.get('link')))
|
|
||||||
|
|
||||||
def postprocess_html(self, soup, first_fetch):
|
|
||||||
for t in soup.findAll(['table', 'tr', 'td']):
|
|
||||||
t.name = 'div'
|
|
||||||
|
|
||||||
for tag in soup.findAll('form', dict(attrs={'name': ["comments_form"]})):
|
|
||||||
tag.extract()
|
|
||||||
for tag in soup.findAll('font', dict(attrs={'id': ["cr-other-headlines"]})):
|
|
||||||
tag.extract()
|
|
||||||
|
|
||||||
return soup
|
|
||||||
|
BIN
recipes/icons/digital_arts.png
Normal file
BIN
recipes/icons/digital_arts.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 4.7 KiB |
BIN
recipes/icons/heritage_foundation.png
Normal file
BIN
recipes/icons/heritage_foundation.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 459 B |
Loading…
x
Reference in New Issue
Block a user