Fixed various recipes that had broken/dead feed links.

This commit is contained in:
Allan Simonsen 2016-10-09 10:50:14 +02:00
parent d2eb1426b0
commit 445955a537
13 changed files with 117 additions and 383 deletions

View File

@ -24,65 +24,39 @@ class BaltimoreSun(BasicNewsRecipe):
no_stylesheets = True
remove_javascript = True
remove_empty_feeds = True
auto_cleanup = False
ignore_duplicate_articles = {'title'}
keep_only_tags = [
dict(name=['div', 'section'], attrs={'class': [
"trb_article_title", "trb_article_leadart", 'trb_bylines', 'trb_article_dateline', 'trb_mainContent']}),
dict(name=['div'], attrs={'class': ['trb_ar_hl', 'trb_ar_hl_t',
'trb_ar_la', 'trb_ar_by',
'trb_ar_dateline', 'trb_ar_page']}),
]
remove_tags = [
dict(name=['meta', 'link']),
dict(name=['div', 'aside'], attrs={'class': lambda x: x and set(x.split()).intersection({
'trb_gptAd', 'trb_panelmod_container', 'trb_socialize', 'trb_taboola', 'trb_embed_related'})}),
]
def preprocess_html(self, soup):
    """Point each ``<img>`` at the URL held in its ``data-baseurl`` attribute.

    Images with no (or an empty) ``data-baseurl`` attribute keep their
    existing ``src`` instead of raising ``KeyError`` and aborting the
    processing of the whole article.

    :param soup: parsed article document exposing ``findAll``.
    :return: the same ``soup`` object, modified in place.
    """
    for img in soup.findAll('img'):
        base_url = img.get('data-baseurl')
        # Only rewrite when there is a replacement URL to use.
        if base_url:
            img['src'] = base_url
    return soup
feeds = [
# News ##
(u'Top Headlines', u'http://feeds.feedburner.com/baltimoresun/news/rss2'),
(u'Breaking News', u'http://feeds.feedburner.com/baltimoresun/news/local/annearundel/rss2'),
(u'Top Maryland', u'http://feeds.feedburner.com/baltimoresun/news/local/rss2'),
# (u'Anne Arundel County', u'http://feeds.feedburner.com/baltimoresun/news/local/annearundel/rss2'),
(u'Baltimore City', u'http://feeds.feedburner.com/baltimoresun/news/local/baltimore_city/rss20xml'),
# (u'Baltimore County', u'http://feeds.feedburner.com/baltimoresun/news/local/baltimore_county/rss2'),
# (u'Carroll County', u'http://feeds.feedburner.com/baltimoresun/news/local/carroll/rss2'),
# (u'Harford County', u'http://feeds.feedburner.com/baltimoresun/news/local/harford/rss2),
# (u'Howard County', u'http://feeds.feedburner.com/baltimoresun/news/local/howard/rss2'),
(u'Education', u'http://feeds.feedburner.com/baltimoresun/news/education/rss2'),
# (u'Obituaries', u'http://feeds.feedburner.com/baltimoresun/news/obituaries/rss2'),
(u'Local Politics',
u'http://feeds.feedburner.com/baltimoresun/news/local/politics/rss2'),
(u'Local Politics', u'http://feeds.feedburner.com/baltimoresun/news/local/politics/rss2'),
(u'Weather', u'http://feeds.feedburner.com/baltimoresun/news/weather/site/rss2'),
# (u'Traffic', u'http://feeds.feedburner.com/baltimoresun/news/traffic/rss2'),
(u'Nation/world', u'http://feeds.feedburner.com/baltimoresun/news/nationworld/rss2'),
# (u'Weird News', u'http://feeds.feedburner.com/baltsun-weirdnews'),
# Sports##
(u'Top Sports', u'http://feeds.feedburner.com/baltimoresun/sports/rss2'),
(u'Orioles/Baseball', u'http://www.baltimoresun.com/sports/orioles/rss2.0.xml'),
(u'Ravens/Football',
u'http://feeds.feedburner.com/baltimoresun/sports/football/rss2'),
# (u'Terps', u''http://feeds.feedburner.com/baltimoresun/sports/terps/rss2'),
# (u'College Football', u''feed://feeds.feedburner.com/baltimoresun/sports/college/football/rss2'),
# (u'Lacrosse', u'http://feeds.feedburner.com/baltimoresun/sports/college/lacrosse/rss2'),
# (u'Horse Racing', u'http://feeds.feedburner.com/baltimoresun/sports/horseracing/rss2'),
# (u'Golf', u'http://feeds.feedburner.com/baltimoresun/sports/golf/rss2'),
# (u'NBA', u'http://feeds.feedburner.com/baltimoresun/sports/basketball/rss2'),
# (u'High School', u'http://feeds.feedburner.com/baltimoresun/sports/highschool/rss2'),
# (u'Outdoors', u'http://feeds.feedburner.com/baltimoresun/sports/outdoors/rss2'),
(u'Ravens/Football', u'http://feeds.feedburner.com/baltimoresun/sports/football/rss2'),
# Entertainment ##
(u'Celebrity News', u'http://baltimore.feedsportal.com/c/34255/f/623042/index.rss'),
(u'Arts & Theater', u'http://feeds.feedburner.com/baltimoresun/entertainment/galleriesmuseums/rss2'),
(u'Movies', u'http://www.baltimoresun.com/entertainment/movies/rss2.0.xml'),
(u'Music & Nightlife',
u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'),
(u'Restaurants & Food',
u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'),
(u'Music & Nightlife', u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'),
(u'Restaurants & Food', u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'),
(u'TV/Media', u'http://www.baltimoresun.com/entertainment/tv/rss2.0.xml'),
# Life ##
@ -91,104 +65,42 @@ class BaltimoreSun(BasicNewsRecipe):
(u'Living Green', u'http://www.baltimoresun.com/features/green/rss2.0.xml'),
(u'Fashion', u'http://www.baltimoresun.com/features/fashion/rss2.0.xml'),
(u'Travel', u'http://www.baltimoresun.com/travel/rss2.0.xml'),
# (u'Faith', u'http://www.baltimoresun.com/features/faith/rss2.0.xml'),
# Business ##
(u'Top Business', u'http://www.baltimoresun.com/business/rss2.0.xml'),
(u'Technology', u'http://www.baltimoresun.com/business/technology/rss2.0.xml'),
(u'Personal finance', u'http://baltimore.feedsportal.com/c/34255/f/623057/index.rss'),
(u'Real Estate', u'http://www.baltimoresun.com/classified/realestate/rss2.0.xml'),
(u'Jobs', u'http://baltimore.feedsportal.com/c/34255/f/623059/index.rss'),
# (u'DIY', u'http://baltimore.feedsportal.com/c/34255/f/623060/index.rss'),
# (u'Consumer Safety', u'http://baltimore.feedsportal.com/c/34255/f/623061/index.rss'),
(u'Investing', u'http://www.baltimoresun.com/business/money/rss2.0.xml'),
# Opinion##
(u'Sun Editorials', u'http://www.baltimoresun.com/news/opinion/editorial/rss2.0.xml'),
(u'Op/Ed', u'http://www.baltimoresun.com/news/opinion/oped/rss2.0.xml'),
(u'Readers Respond', u'http://baltimore.feedsportal.com/c/34255/f/623065/index.rss'),
# Columnists ##
(u'Kevin Cowherd', u'http://www.baltimoresun.com/sports/bal-columnist-cowherd,0,6829726.columnist-rss2.0.xml'),
(u'Robert Ehrlich', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-ehrlich,0,1825227.columnist-rss2.0.xml'),
(u'Jacques Kelly', u'http://www.baltimoresun.com/news/maryland/bal-columnist-kelly,0,1154701.columnist-rss2.0.xml'),
(u'Marta H. Mossburg', u'http://www.baltimoresun.com/news/opinion/oped/bal-columnist-mossburg,0,7982155.columnist-rss2.0.xml'),
(u'Mike Preston', u'http://www.baltimoresun.com/sports/bal-columnist-preston,0,6169796.columnist-rss2.0.xml'),
(u'Susan Reimer', u'http://www.baltimoresun.com/news/opinion/bal-columnist-reimer,0,162466.columnist-rss2.0.xml'),
(u'Dan Rodricks', u'http://www.baltimoresun.com/news/maryland/bal-columnist-rodricks,0,7089843.columnist-rss2.0.xml'),
(u'Thomas F. Schaller', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-schaller,0,897397.columnist-rss2.0.xml'),
(u'Peter Schmuck', u'http://www.baltimoresun.com/sports/bal-columnist-schmuck,0,7485088.columnist-rss2.0.xml'),
# News Blogs ##
(u'Baltimore Crime Beat',
u'http://baltimore.feedsportal.com/c/34255/f/623075/index.rss'),
(u'InsideEd', u'http://www.baltimoresun.com/news/maryland/education/blog/rss2.0.xml'),
(u'Maryland Politics',
u'http://www.baltimoresun.com/news/maryland/politics/blog/rss2.0.xml'),
(u'Maryland Weather',
u'http://www.baltimoresun.com/news/weather/weather-blog/rss2.0.xml'),
(u'Second Opinion',
u'http://www.baltimoresun.com/news/opinion/second-opinion-blog/rss2.0.xml'),
(u'Sun Investigates',
u'http://www.baltimoresun.com/news/maryland/sun-investigates/rss2.0.xml'),
(u'Maryland Politics', u'http://www.baltimoresun.com/news/maryland/politics/blog/rss2.0.xml'),
(u'Maryland Weather', u'http://www.baltimoresun.com/news/weather/weather-blog/rss2.0.xml'),
(u'Second Opinion', u'http://www.baltimoresun.com/news/opinion/second-opinion-blog/rss2.0.xml'),
(u'Sun Investigates', u'http://www.baltimoresun.com/news/maryland/sun-investigates/rss2.0.xml'),
(u'You Dont Say', u'http://www.baltimoresun.com/news/language-blog/rss2.0.xml'),
# Business Blogs ##
(u'BaltTech', u'http://www.baltimoresun.com/business/technology/blog/rss2.0.xml'),
(u'Consuming Interests',
u'http://www.baltimoresun.com/business/consuming-interests-blog/rss2.0.xml'),
(u'The Real Estate Wonk',
u'http://www.baltimoresun.com/business/real-estate/wonk/rss2.0.xml'),
(u'Consuming Interests', u'http://www.baltimoresun.com/business/consuming-interests-blog/rss2.0.xml'),
(u'The Real Estate Wonk', u'http://www.baltimoresun.com/business/real-estate/wonk/rss2.0.xml'),
# Entertainment Blogs ##
(u'ArtSmash', 'http://www.baltimoresun.com/entertainment/arts/artsmash/rss2.0.xml'),
(u'Baltimore Diner', u'http://baltimore.feedsportal.com/c/34255/f/623088/index.rss'),
(u'Midnight Sun', u'http://www.baltimoresun.com/entertainment/music/midnight-sun-blog/rss2.0.xml'),
(u'Read Street', u'http://www.baltimoresun.com/features/books/read-street/rss2.0.xml'),
(u'Z on TV', u'http://www.baltimoresun.com/entertainment/tv/z-on-tv-blog/rss2.0.xml'),
# Life Blogs ##
# (u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'),
(u'Baltimore Insider',
u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'),
(u'Baltimore Insider', u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'),
(u'Picture of Health', u'http://www.baltimoresun.com/health/blog/rss2.0.xml'),
# (u'Unleashed', u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'),
# b the site blogs ##
(u'TV Lust', u'http://baltimore.feedsportal.com/c/34255/f/623096/index.rss'),
# Sports Blogs ##
(u'Baltimore Sports Blitz',
u'http://baltimore.feedsportal.com/c/34255/f/623097/index.rss'),
# (u'Lacrosse Insider',u'http://www.baltimoresun.com/sports/lacrosse-blog/rss2.0.xml'),
(u'Orioles Insider', u'http://baltimore.feedsportal.com/c/34255/f/623100/index.rss'),
(u'Ravens Insider',
u'http://www.baltimoresun.com/sports/ravens/ravens-insider/rss2.0.xml'),
# (u'Ring Posts', u'http://weblogs.baltimoresun.com/sports/wrestling/blog/index.xml'),
(u'The Schmuck Stops Here',
u'http://www.baltimoresun.com/sports/schmuck-blog/rss2.0.xml'),
# (u'Tracking the Terps', u'http://weblogs.baltimoresun.com/sports/college/maryland_terps/blog/index.xml'),
# (u'Varsity Letters', u'http://weblogs.baltimoresun.com/sports/highschool/varsityletters/index.xml'),
(u'Ravens Insider', u'http://www.baltimoresun.com/sports/ravens/ravens-insider/rss2.0.xml'),
(u'The Schmuck Stops Here', u'http://www.baltimoresun.com/sports/schmuck-blog/rss2.0.xml'),
]
def get_article_url(self, article):
    """Return the URL to download for a feed entry, or ``None`` if it has none.

    The entry's ``summary`` is searched first for an ``href`` containing
    ``bookmark.cfm`` whose ``link=`` parameter holds the percent-encoded
    article URL. If that lookup fails for any reason, the entry's
    ``feedburner_origlink``, then ``guid``, then ``link`` is used instead.
    Every ``?track=rss`` occurrence is removed from the result.

    :param article: feed entry exposing ``summary`` and ``get``.
    :return: the cleaned URL, or ``None`` when no URL could be found.
    """
    ans = None
    try:
        s = article.summary
        ans = urllib.unquote(
            re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
    except Exception:
        # Best effort only: a missing summary or a summary without the
        # bookmark link simply means the fallback keys below are used.
        # ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt/SystemExit propagate.
        pass
    if ans is None:
        ans = article.get('feedburner_origlink',
                          article.get('guid', article.get('link')))
    if ans is None:
        return None
    return ans.replace('?track=rss', '')
def skip_ad_pages(self, soup):
    """Follow the "continue to article" link of an interstitial page.

    Looks for the text node 'click here to continue to article'; when it
    is found and its parent carries an ``href``, that URL is fetched with
    ``self.index_to_soup(url, raw=True)`` and the result returned.
    Otherwise ``None`` is returned.
    """
    marker = soup.find(text='click here to continue to article')
    if not marker:
        return None
    target = marker.parent.get('href')
    if not target:
        return None
    return self.index_to_soup(target, raw=True)

View File

@ -12,20 +12,24 @@ class DallasNews(BasicNewsRecipe):
auto_cleanup = True
feeds = [
('News',
'http://www.dallasnews.com/news.rss'),
('Local News',
'http://www.dallasnews.com/news/politics/local-politics/?rss'),
('National Politics',
'http://www.dallasnews.com/news/politics/national-politic/?rss'),
'http://www.dallasnews.com/news/local-politics.rss'),
('State Politics',
'http://www.dallasnews.com/news/politics/state-politics/?rss'),
'http://www.dallasnews.com/news/texas-politics.rss'),
('Religion',
'http://www.dallasnews.com/news/religion/?rss'),
'http://www.dallasnews.com/life/faith.rss'),
('Crime',
'http://www.dallasnews.com/news/crime/headlines/?rss'),
'http://www.dallasnews.com/news/crime.rss'),
('Celebrity News',
'http://www.dallasnews.com/entertainment/celebrity-news/?rss&listname=TopStories'),
('Nation',
'http://www.dallasnews.com/news/nation-world/nation/?rss'),
('World',
'http://www.dallasnews.com/news/nation-world/world/?rss'),
('Business',
'http://www.dallasnews.com/business.rss'),
('Arts',
'http://www.dallasnews.com/arts.rss'),
('Life',
'http://www.dallasnews.com/life.rss'),
('Opinion',
'http://www.dallasnews.com/opinion.rss'),
]

View File

@ -18,12 +18,15 @@ articles_are_obfuscated = True
class digiArts(BasicNewsRecipe):
__author__ = 'Lorenzo Vigentini'
description = 'Digital Arts - comprehensive coverage of the art of graphic design, 3D, animation, video, effects, web and interactive design, in print and online.' # noqa
description = ('Digital Arts - comprehensive coverage of the art of '
'graphic design, 3D, animation, video, effects, web and '
'interactive design, in print and online.') # noqa
cover_url = 'http://media.digitalartsonline.co.uk/graphics/logo_digital_arts.gif'
title = 'Digital Arts Magazine '
publisher = 'IDG Communication'
category = 'Multimedia, photo, video, computing, product reviews, editing, cameras, production'
category = ('Multimedia, photo, video, computing, product reviews, '
'editing, cameras, production')
language = 'en'
encoding = 'cp1252'
@ -36,30 +39,22 @@ class digiArts(BasicNewsRecipe):
remove_javascript = True
no_stylesheets = True
def get_obfuscated_article(self, url):
    """Download the article behind ``url`` into a temporary HTML file.

    Opens ``url + '&print'`` with the recipe's browser, follows a link via
    ``follow_link(url, nr=0)``, and writes the response body to a new
    ``PersistentTemporaryFile`` that is also recorded in
    ``self.temp_files``.

    :return: the name of the temporary file holding the downloaded HTML.
    """
    browser = self.get_browser()
    browser.open(url + '&print')
    page = browser.follow_link(url, nr=0).read()
    # Register the file on the recipe before writing, as the original did.
    tmp = PersistentTemporaryFile('_fa.html')
    self.temp_files.append(tmp)
    tmp.write(page)
    tmp.close()
    return tmp.name
auto_cleanup = False
keep_only_tags = [
dict(name='div', attrs={'id': ['articleHeader', 'articleContent']})
dict(name='h1', attrs={'itemprop': 'headline'}),
dict(name='span', attrs={'itemprop': 'author'}),
dict(name='section', attrs={'class': 'articleBody'}),
]
remove_tags = [
dict(name='div', attrs={'class': ['submissionBar', 'mpuContainer']}),
dict(name='div', attrs={'id': ['articleSidebar', 'articleFooter']})
]
remove_tags_after = [
dict(name='p', attrs={'id': 'articlePageList'})
]
# Feeds are found here: http://www.digitalartsonline.co.uk/rss/
feeds = [
(u'Content', u'http://rss.feedsportal.com/c/662/f/8410/index.rss')
('Latest News Articles',
'http://www.digitalartsonline.co.uk/rss/feeds/digitalarts-news.xml'),
('Latest Tutorials',
'http://www.digitalartsonline.co.uk/rss/feeds/digitalarts-tutorials.xml'),
('Latest Reviews',
'http://www.digitalartsonline.co.uk/rss/feeds/digitalarts-reviews.xml'),
('Latest Features',
'http://www.digitalartsonline.co.uk/rss/feeds/digitalarts-features.xml'),
]

View File

@ -112,23 +112,13 @@ class DiscoverMagazine(BasicNewsRecipe):
return soup
feeds = [
(u'Technology', u'http://discovermagazine.com/topics/technology/rss.xml'),
(u'Health - Medicine',
u'http://discovermagazine.com/topics/health-medicine/rss.xml'),
(u'Mind Brain', u'http://discovermagazine.com/topics/mind-brain/rss.xml'),
(u'Space', u'http://discovermagazine.com/topics/space/rss.xml'),
(u'Human Origins', u'http://discovermagazine.com/topics/human-origins/rss.xml'),
(u'Living World', u'http://discovermagazine.com/topics/living-world/rss.xml'),
(u'Environment', u'http://discovermagazine.com/topics/environment/rss.xml'),
(u'Physics & Math', u'http://discovermagazine.com/topics/physics-math/rss.xml'),
(u'Technologiy', u'http://feeds.feedburner.com/DiscoverTechnology'),
(u'Health & Medicine', u'http://feeds.feedburner.com/DiscoverHealthMedicine'),
(u'Mind Brain', u'http://feeds.feedburner.com/DiscoverMindBrain'),
(u'Space & Physics', u'http://feeds.feedburner.com/DiscoverSpace'),
(u'Living World', u'http://feeds.feedburner.com/DiscoverLivingWorld'),
(u'Environment', u'http://feeds.feedburner.com/DiscoverEnvironment'),
(u"20 Things you didn't know about...",
u'http://discovermagazine.com/columns/20-things-you-didnt-know/rss.xml'),
(u'Fuzzy Math', u'http://discovermagazine.com/columns/fuzzy-math/rss.xml'),
(u'The Brain', u'http://discovermagazine.com/columns/the-brain/rss.xml'),
(u'What is This', u'http://discovermagazine.com/columns/what-is-this/rss.xml'),
(u'Vital Signs', u'http://discovermagazine.com/columns/vital-signs/rss.xml'),
(u'Think Tech', u'http://discovermagazine.com/columns/think-tech/rss.xml'),
(u'Future Tech', u'http://discovermagazine.com/columns/future-tech/rss.xml'),
(u'Discover Interview',
u'http://discovermagazine.com/columns/discover-interview/rss.xml'),
u'http://feeds.feedburner.com/20ThingsYouDidntKnowAbout'),
(u'Vital Signs', u'http://feeds.feedburner.com/discovermagazine/VitalSigns'),
]

View File

@ -18,9 +18,12 @@ class EandP(BasicNewsRecipe):
encoding = 'utf8'
cover_url = 'http://www.editorandpublisher.com/images/EP_main_logo.gif'
remove_javascript = True
auto_cleanup = True
html2lrf_options = [
'--comment', description, '--category', category, '--publisher', publisher
'--comment', description,
'--category', category,
'--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + \
@ -34,21 +37,11 @@ class EandP(BasicNewsRecipe):
h2{font-size: large;}
'''
# Keep only div:itemmgap
keep_only_tags = [
dict(name='div', attrs={'class': 'itemmgap'})
]
# Remove commenting/social media links
remove_tags_after = [dict(name='div', attrs={'class': 'clear'})]
feeds = [(u'Breaking News', u'http://www.editorandpublisher.com/GenerateRssFeed.aspx'),
(u'Business News',
u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=2'),
(u'Ad/Circ News', u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=3'),
(u'Newsroom', u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=4'),
(u'Technology News',
u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=5'),
(u'Syndicates News', u'http://www.editorandpublisher.com/GenerateRssFeed.aspx?CategoryId=7')]
feeds = [
(u'Editor & Publisher', u'http://www.editorandpublisher.com/feed/'),
(u'Comments', u'http://www.editorandpublisher.com/comments/feed/'),
]

View File

@ -5,32 +5,9 @@ class AdvancedUserRecipe1295088390(BasicNewsRecipe):
title = u'Everett Herald'
language = 'en'
__author__ = '77ja65'
oldest_article = 4
oldest_article = 7
max_articles_per_feed = 50
no_stylesheets = True
masthead_url = 'http://heraldnet.com/images/hnet/jQueryComponents/jQueryNavigation/heraldnet_logo.png'
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
auto_cleanup = True
feeds = [(u'Local News',
u'http://heraldnet.com/section/RSS02&mime=xml'),
(u'Sports', u'http://heraldnet.com/section/RSS04&mime=xml'),
(u'Entertainment',
u'http://heraldnet.com/section/RSS07&mime=xml'),
(u'Life', u'http://heraldnet.com/section/RSS03&mime=xml'),
(u'Breaking News',
u'http://heraldnet.com/section/RSS34&mime=xml'),
(u'Seahawks', u'http://heraldnet.com/section/RSS22&mime=xml'),
(u'HeraldNet', u'http://heraldnet.com/section/RSS01&mime=xml'),
(u'Inside Everett',
u'http://heraldnet.com/section/RSS26&mime=xml')
]
def print_version(self, url):
    """Return ``url`` with the printer-friendly template parameter appended."""
    suffix = "&template=PrinterFriendly"
    return url + suffix
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-
weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-
weight:normal;font-size:small;}
'''
feeds = [(u'Local News', u'http://www.heraldnet.com/feed/')]

View File

@ -6,101 +6,29 @@ class FairbanksDailyNewsminer(BasicNewsRecipe):
__author__ = 'Roger'
oldest_article = 7
max_articles_per_feed = 100
description = 'The voice of interior Alaska since 1903'
publisher = 'http://www.newsminer.com/'
category = 'news, Alaska, Fairbanks'
language = 'en'
# Make article titles, author and date bold, italic or small font.
# http://assets.matchbin.com/sites/635/stylesheets/newsminer.com.css
# (signature_line contains date, views, comments)
extra_css = '''
.story_item_headline { font-size: medium; font-weight: bold; }
.story_item_author { font-size: small; font-style:italic; }
.signature_line { font-size: small; }
'''
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
language = 'en'
encoding = 'utf8'
conversion_options = {'linearize_tables': True}
auto_cleanup = True
# TODO: The News-miner cover image seems a bit small. Can this be
# enlarged by 10-30%?
masthead_url = 'http://d2uh5w9wm14i0w.cloudfront.net/sites/635/assets/top_masthead_-_menu_pic.jpg'
# In order to omit seeing number of views, number of posts and the pipe
# symbol for divider after the title and date of the article, a regex or
# manual processing is needed to get just the "story_item_date updated"
# (which contains the date). Everything else on this line is pretty much not needed.
#
# Currently, you will see the following:
# | Aug 24, 2011 | 654 views | 6 | |
# (ie. 6 comments)
#
# The following was suggested, but it looks like I also need to define self & soup
# (as well as bring in extra soup depends?)
# date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'}))
# preprocess_regexps = [(re.compile(r'<span[^>]*addthis_separator*>'), lambda match: '') ]
# preprocess_regexps = [(re.compile(r'span class="addthis_separator">|</span>'), lambda match: '') ]
# preprocess_regexps = [
# (re.compile(r'<start>.*?<end>', re.IGNORECASE | re.DOTALL), lambda match : ''),
# ]
# def get_browser(self):
# def preprocess_html(soup, first_fetch):
# date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'}))
# return
# preprocess_regexps = [(re.compile(r'&nbsp;|.*?', re.DOTALL), lambda m: '')]
keep_only_tags = [
dict(name='div', attrs={'class': 'story_item_headline entry-title'}),
dict(name='div', attrs={'class': 'full_story'})
]
remove_tags = [
# Try getting rid of some signature_line (date line) stuff
dict(name='img', attrs={'class': 'dont_touch_me'}),
dict(name='span', attrs={
'class': 'number_recommendations'}),
# Removes div within <!-- AddThis Button BEGIN --> <!--
# AddThis Button END -->
dict(name='div', attrs={
'class': 'addthis_toolbox addthis_default_style'}),
dict(name='div', attrs={'class': 'related_content'}),
dict(name='div', attrs={'id': 'comments_container'})
]
# Comment-out or uncomment any of the following RSS feeds according to your
# liking.
#
# TODO: Some random bits of text might be trailing the last page (or TOC on
# MOBI files), these are bits of public posts and comments and need to also
# be removed.
#
feeds = [
(u'Alaska News', u'http://newsminer.com/rss/rss_feeds/alaska_news?content_type=article&tags=alaska_news&page_name=rss_feeds&instance=alaska_news'),
(u'Local News', u'http://newsminer.com/rss/rss_feeds/local_news?content_type=article&tags=local_news&page_name=rss_feeds&offset=0&instance=local_news'),
(u'Business', u'http://newsminer.com/rss/rss_feeds/business_news?content_type=article&tags=business_news&page_name=rss_feeds&instance=business_news'),
(u'Politics', u'http://newsminer.com/rss/rss_feeds/politics_news?content_type=article&tags=politics_news&page_name=rss_feeds&instance=politics_news'),
(u'Sports', u'http://newsminer.com/rss/rss_feeds/sports_news?content_type=article&tags=sports_news&page_name=rss_feeds&instance=sports_news'),
(u'Latitude 65 feed', u'http://newsminer.com/rss/rss_feeds/latitude_65?content_type=article&tags=latitude_65&page_name=rss_feeds&offset=0&instance=latitude_65'), # noqa
# (u'Sundays', u'http://newsminer.com/rss/rss_feeds/Sundays?content_type=article&tags=alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Sundays'), # noqa
(u'Outdoors', u'http://newsminer.com/rss/rss_feeds/Outdoors?content_type=article&tags=outdoors&page_name=rss_feeds&instance=Outdoors'),
# (u'Fairbanks Grizzlies', u'http://newsminer.com/rss/rss_feeds/fairbanks_grizzlies?content_type=article&tags=fairbanks_grizzlies&page_name=rss_feeds&instance=fairbanks_grizzlies'), # noqa
# (u'Newsminer', u'http://newsminer.com/rss/rss_feeds/Newsminer?content_type=article&tags=ted_stevens_bullets+ted_stevens+sports_news+business_news+fairbanks_grizzlies+dermot_cole_column+outdoors+alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Newsminer'), # noqa
(u'Opinion', u'http://newsminer.com/rss/rss_feeds/Opinion?content_type=article&tags=editorials&page_name=rss_feeds&instance=Opinion'),
(u'Youth', u'http://newsminer.com/rss/rss_feeds/Youth?content_type=article&tags=youth&page_name=rss_feeds&instance=Youth'),
# (u'Dermot Cole Blog', u'http://newsminer.com/rss/rss_feeds/dermot_cole_blog+rss?content_type=blog+entry&sort_by=posted_on&user_ids=3015275&page_name=blogs_dermot_cole&limit=10&instance=dermot_cole_blog+rss'), # noqa
(u'Dermot Cole Column', u'http://newsminer.com/rss/rss_feeds/Dermot_Cole_column?content_type=article&tags=dermot_cole_column&page_name=rss_feeds&instance=Dermot_Cole_column'), # noqa
# (u'Sarah Palin', u'http://newsminer.com/rss/rss_feeds/sarah_palin?content_type=article&tags=palin_in_the_news+palin_on_the_issues&page_name=rss_feeds&tag_inclusion=or&instance=sarah_palin') # noqa
(u'Alaska News',
u'http://www.newsminer.com/search/?f=rss&t=article&c=news/alaska_news&l=50&s=start_time&sd=desc'),
(u'Local News',
u'http://www.newsminer.com/search/?f=rss&t=article&c=news/local_news&l=50&s=start_time&sd=desc'),
(u'Business',
u'http://www.newsminer.com/search/?f=rss&t=article&c=business&l=50&s=start_time&sd=desc'),
(u'Politics',
u'http://www.newsminer.com/search/?f=rss&t=article&c=news/politics&l=50&s=start_time&sd=desc'),
(u'Sports',
u'http://www.newsminer.com/search/?f=rss&t=article&c=sports&l=50&s=start_time&sd=desc'),
(u'Opinion',
u'http://www.newsminer.com/search/?f=rss&t=article&c=opinion&l=50&s=start_time&sd=desc'),
]

View File

@ -20,6 +20,7 @@ class FanGraphs(BasicNewsRecipe):
category = 'Baseball'
language = 'en'
publication_type = 'Blog'
auto_cleanup = True
description = 'Baseball statistical analysis, graphs, and projections.'
__author__ = 'David Appelman'
@ -27,9 +28,8 @@ class FanGraphs(BasicNewsRecipe):
feeds = [
(u'Fangraphs', u'http://feeds.feedburner.com/FanGraphs?format=xml'),
(u'Rotographs', u'http://www.wizardrss.com/feed/feeds.feedburner.com/RotoGraphs?format=xml'),
(u'Community', u'http://www.wizardrss.com/feed/www.fangraphs.com/community/?feed=rss2'),
(u'NotGraphs', u'http://www.wizardrss.com/feed/www.fangraphs.com/not/?feed=rss2')]
(u'Rotographs', u'http://feeds.feedburner.com/RotoGraphs?format=xml'),
(u'NotGraphs', u'http://feeds.feedburner.com/NotGraphs?format=xml')]
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}

View File

@ -10,37 +10,27 @@ class AdvancedUserRecipe1305547242(BasicNewsRecipe):
language = 'en'
remove_javascript = True
__author__ = 'Anonymous'
remove_tags = [
dict(name='div', attrs={'class': 'articles_footer', 'class': 'printoptions'})]
auto_cleanup = True
def print_version(self, url):
    """Return ``url`` with the ``?printable=true`` query string appended."""
    query = '?printable=true'
    return url + query
def preprocess_html(self, soup):
    """Replace every ``<a>`` that has a ``string`` with that string.

    Anchors whose ``string`` is ``None`` are left in place.

    :return: the same ``soup`` object, modified in place.
    """
    for anchor in soup.findAll('a'):
        label = anchor.string
        if label is None:
            continue
        anchor.replaceWith(label)
    return soup
feeds = [ (u'All Fashion', u'http://feeds.glamour.com/glamour/all_fashion'),
(u'All Beauty', u'http://feeds.glamour.com/glamour/all_beauty'),
feeds = [
(u'All Fashion',
u'http://feeds.glamour.com/glamour/all_fashion'),
(u'All Beauty',
u'http://feeds.glamour.com/glamour/all_beauty'),
(u'All Sex, Love & Life',
u'http://feeds.glamour.com/glamour/sex_love_life'),
(u'All Health & Fitness',
u'http://feeds.glamour.com/glamour/health_fitness'),
(u'Shopping', u'http://feeds.glamour.com/glamour/shopping'),
(u'Slaves to Fashion blog',
u'http://feeds.glamour.com/glamour/slavestofashion'),
(u'The Girls in the Beauty Department',
u'http://feeds.glamour.com/glamour/thegirlsinthebeautydepartment'),
(u'Smitten blog', u'http://feeds.glamour.com/glamour/smitten'),
(u'Save the Date', u'http://feeds.feedburner.com/glamour/save-the-date'),
(u'Single-ish blog', u'http://feeds.glamour.com/glamour/glamoursingle-ish'),
(u'Save the Date', u'http://feeds.feedburner.com/glamour/save-the-date'),
(u'Vitamin G blog', u'http://feeds.glamour.com/glamour/vitamin-g'),
(u'Margarita Shapes Up blog',
u'http://feeds.glamour.com/glamour/margaritashapesup'),
(u'Little Miss Fortune blog',
u'http://feeds.glamour.com/glamour/little-miss-fortune'),
(u'Smitten blog',
u'http://feeds.glamour.com/glamour/smitten'),
(u'Save the Date',
u'http://feeds.feedburner.com/glamour/save-the-date'),
(u'Save the Date',
u'http://feeds.feedburner.com/glamour/save-the-date'),
(u'Vitamin G blog',
u'http://feeds.glamour.com/glamour/vitamin-g'),
]

View File

@ -19,29 +19,19 @@ class NewsandRecord(BasicNewsRecipe):
encoding = 'utf-8'
remove_javascript = True
no_stylesheets = True
auto_cleanup = True
conversion_options = {
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
remove_tags_before = dict(name='h3', attrs={'class': 'nrcTxt_headline'})
remove_tags_after = dict(name='div', attrs={'id': 'nrcBlk_ContentBody'})
remove_tags = [
dict(name='iframe'),
dict(name=['notags', 'embed', 'object', 'link', 'img']),
]
feeds = [
('News', 'http://www.news-record.com/news/archive/feed'),
('Greensboro News', 'http://www.news-record.com/news/greensboro/feed'),
('Education', 'http://www.news-record.com/news/education/feed'),
('Government', 'http://www.news-record.com/news/government/feed'),
('College Sports', 'http://www.news-record.com/sports/college/feed'),
('Sports Extra', 'http://www.news-record.com/blog/sportsextra/feed'),
('Life', 'http://www.news-record.com/life/top/feed'),
('NASCAR', 'http://www.news-record.com/sports/nascar/top/feed'),
('Editorials', 'http://www.news-record.com/opinion/editorials/feed'),
('Letters to the Editor', 'http://www.news-record.com/opinion/letters/feed')
('News', 'http://www.greensboro.com/search/?q=&t=article&l=10&d=&d1=&d2=&s=start_time&sd=desc&c[]=news,news/*&f=rss'),
('Greensboro News', 'http://www.greensboro.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=news/local,news/crime,news/goverment,news/schools,news/rockingham_county,news/local,news/crime,news/goverment,news/schools,news/rockingham_county/*&f=rss'),
('Business', 'http://www.greensboro.com/search/?q=&t=article&l=10&d=&d1=&d2=&s=start_time&sd=desc&c[]=business,business/*&f=rss'),
('Local Business', 'http://www.greensboro.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=business/local_business,business/local_business/*&f=rss'),
('Sports', 'http://www.greensboro.com/search/?q=&t=article&l=10&d=&d1=&d2=&s=start_time&sd=desc&c[]=sports,sports/*&f=rss'),
('College Sports', 'http://www.greensboro.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=sports/college,sports/college/*&f=rss'),
('Sports Extra', 'http://www.greensboro.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=blogs/sports_extra,blogs/sports_extra/*&f=rss'),
('Life', 'http://www.greensboro.com/search/?q=&t=article&l=10&d=&d1=&d2=&s=start_time&sd=desc&c[]=life,life/*&f=rss'),
]

View File

@ -12,36 +12,11 @@ class ChicagoTribune(BasicNewsRecipe):
__author__ = 'Being and Sujata Raman'
description = 'Politics, local and business news from Hartford'
language = 'en'
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
auto_cleanup = True
keep_only_tags = [dict(name='div', attrs={'class': ["story", "entry-asset asset hentry"]}),
dict(name='div', attrs={
'id': ["pagebody", "story", "maincontentcontainer"]}),
]
remove_tags_after = [{'class': ['photo_article', ]}]
remove_tags = [
{'id': ["moduleArticleTools", "content-bottom", "rail", "articleRelates module", "toolSet", "relatedrailcontent", "div-wrapper", "beta", "atp-comments", "footer"]}, # noqa
{'class': ["clearfix", "relatedTitle", "articleRelates module", "asset-footer", "tools", "comments",
"featurePromo", "featurePromo fp-topjobs brownBackground", "clearfix fullSpan brownBackground", "curvedContent"]},
dict(name='font', attrs={'id': ["cr-other-headlines"]})]
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
.byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
.date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
.story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
.story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''
feeds = [
('Breaking News', 'http://feeds.feedburner.com/courant-breaking-news/'),
('Nation/World News', 'http://feeds.feedburner.com/courant-nation-world/'),
@ -64,29 +39,9 @@ class ChicagoTribune(BasicNewsRecipe):
('Music', 'http://feeds.feedburner.com/courant-music/'),
('TV', 'http://feeds.feedburner.com/courant-tv/'),
('Movies', 'http://feeds.feedburner.com/courant-movies/'),
# ('Metromix headlines', 'http://feeds.feedburner.com/metromix/topheadlines/'),
# ('Metromix events', 'http://feeds.feedburner.com/metromix/events/'),
# ('Metromix restaurants', 'http://feeds.feedburner.com/metromix/restaurants/'),
('Outdoors', 'http://feeds.feedburner.com/courant-outdoors/'),
('Peter Marteka', 'http://feeds.feedburner.com/courant-marteka-column/'),
('Susan Campbell', 'http://feeds.feedburner.com/courant-campbell-column/'),
('Helen Ubinas', 'http://feeds.feedburner.com/courant-helen-ubinas-column/'),
('Jim Shea', 'http://feeds.feedburner.com/courant-jim-shea-column/'),
('Tom Condon', 'http://feeds.feedburner.com/courant-tom-condon-column/'),
('Colin McEnroe', 'http://feeds.feedburner.com/courant-colin-mcenroe-column/'),
]
def get_article_url(self, article):
    """Return the URL to download for a feed entry.

    Uses the entry's ``feedburner_origlink`` if present, otherwise its
    ``guid``, otherwise its ``link``; the result is ``None`` when the
    entry has none of these keys.

    The former debug ``print`` of the URL was dropped: it wrote every
    article URL to stdout and was a Python 2 only statement.
    """
    return article.get('feedburner_origlink',
                       article.get('guid', article.get('link')))
def postprocess_html(self, soup, first_fetch):
    """Flatten table markup and strip comment-form / other-headline tags.

    Every ``table``, ``tr`` and ``td`` tag is renamed to ``div``; then the
    tags returned by the two ``findAll`` queries below are extracted from
    the document.

    :return: the same ``soup`` object, modified in place.
    """
    for cell in soup.findAll(['table', 'tr', 'td']):
        cell.name = 'div'
    # NOTE(review): ``dict(attrs=...)`` is passed positionally, so findAll
    # receives {'attrs': {...}} as its second argument -- confirm this is
    # the intended filter. It is reproduced unchanged here.
    unwanted = (
        ('form', {'name': ["comments_form"]}),
        ('font', {'id': ["cr-other-headlines"]}),
    )
    for tag_name, attr_filter in unwanted:
        for tag in soup.findAll(tag_name, dict(attrs=attr_filter)):
            tag.extract()
    return soup

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 459 B