This commit is contained in:
Kovid Goyal 2016-10-25 20:28:06 +05:30
commit d664b749dc
18 changed files with 0 additions and 818 deletions

View File

@ -1,36 +0,0 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.accountancyage.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class AccountancyAge(BasicNewsRecipe):
    """Fetch business news from www.accountancyage.com."""
    title = 'Accountancy Age'
    __author__ = 'Darko Miletic'
    description = 'business news'
    publisher = 'accountancyage.com'
    category = 'news, politics, finances'
    language = 'en'
    lang = 'en'
    encoding = 'utf-8'
    oldest_article = 2
    max_articles_per_feed = 100
    simultaneous_downloads = 1
    no_stylesheets = True
    use_embedded_content = False

    feeds = [
        (u'All News',
         u'http://feeds.accountancyage.com/rss/latest/accountancyage/all'),
    ]

    # Keep just the headline and the article body column.
    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'class': 'article_content'}),
    ]

    def get_article_url(self, article):
        # The feed's <guid> element carries the canonical article URL.
        return article.get('guid', None)

View File

@ -1,27 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Oskar Kunicki <rakso at interia.pl>'
'''
Changelog:
2011-11-27
News from BluesRSS.info
'''
from calibre.web.feeds.news import BasicNewsRecipe
class BluesRSS(BasicNewsRecipe):
    """World-wide blues news aggregated from BluesRSS.info."""
    title = 'Blues News'
    __author__ = 'Oskar Kunicki'
    description = 'Blues news from around the world'
    publisher = 'BluesRSS.info'
    category = 'news, blues, USA,UK'
    language = 'en'
    oldest_article = 5
    max_articles_per_feed = 100
    no_stylesheets = True
    cover_url = 'http://bluesrss.info/cover.jpg'
    masthead_url = 'http://bluesrss.info/cover.jpg'

    # Drop the WordPress pagination widget from article bodies.
    remove_tags = [dict(name='div', attrs={'class': 'wp-pagenavi'})]

    feeds = [(u'News', u'http://bluesrss.info/feed/')]

View File

@ -1,61 +0,0 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__author__ = 'DrMerry Based on v1.01 by Lorenzo Vigentini'
__copyright__ = 'For version 1.02, 1.03: DrMerry'
__version__ = 'v1.03'
__date__ = '11, July 2011'
'''
http://www.computeractive.co.uk/
'''
from calibre.web.feeds.news import BasicNewsRecipe
import re
class computeractive(BasicNewsRecipe):
    """UK 'Computer act!ve' magazine feeds (Incisive Media)."""
    title = 'Computer act!ve'
    __author__ = 'DrMerry'
    description = 'Computeractive publishes new downloads, reviews, news stories, step-by-step guides and answers to PC problems every day.'
    cover_url = 'http://images.pcworld.com/images/common/header/header-logo.gif'
    publisher = 'Incisive media'
    category = 'PC, video, computing, product reviews, editing, cameras, production'
    language = 'en'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 7
    max_articles_per_feed = 25
    recursion = 10
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    remove_empty_feeds = True

    # Keep the main article column; everything after the tag block is chrome.
    keep_only_tags = [dict(name='div', attrs={'id': 'container_left'})]
    remove_tags_after = dict(name='div', attrs={'class': 'article_tags_block'})

    # Page furniture stripped from the article column, grouped by attribute.
    _unwanted_ids = ['seeAlsoTags', 'commentsModule', 'relatedArticles',
                     'mainLeft', 'mainRight', 'recent_comment_block_parent',
                     'reviewDetails']
    _unwanted_classes = ['buyIt', 'detailMpu', 'small_section',
                         'recent_comment_block_parent',
                         'title_right_button_fix',
                         'section_title.title_right_button_fix',
                         'common_button']
    remove_tags = [
        dict(name='div', attrs={'id': _unwanted_ids}),
        dict(name='div', attrs={'class': _unwanted_classes}),
        dict(name='a', attrs={'class': 'largerImage'}),
    ]

    # Strip all anchor tags; hyperlinks are of no use in the e-book output.
    preprocess_regexps = [
        (re.compile(r'(<a [^>]*>|</a>)', re.DOTALL | re.IGNORECASE),
         lambda match: ''),
    ]

    feeds = [
        (u'General content',
         u'http://feeds.computeractive.co.uk/rss/latest/computeractive/all'),
        (u'News',
         u'http://feeds.computeractive.co.uk/rss/latest/computeractive/news'),
    ]

View File

@ -1,61 +0,0 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__copyright__ = '2010, Starson17'
'''
www.epicurious.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Epicurious(BasicNewsRecipe):
    """Food and recipes from www.epicurious.com."""
    title = u'Epicurious'
    __author__ = 'Starson17'
    description = 'Food and Recipes from Epicurious'
    cover_url = 'http://up6.podbean.com/image-logos/21849_logo.jpg'
    publisher = 'Epicurious'
    tags = 'news, food, gourmet, recipes'
    language = 'en'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    recursions = 3
    oldest_article = 14
    max_articles_per_feed = 20

    keep_only_tags = [dict(name='div', attrs={'class': ['mainconsolewrapper', 'videoheader', 'content_unit', 'entry-content', 'see_more_block']}),
                      dict(name='div', attrs={'id': [
                          'headline', 'introBlock', 'ingredients', 'preparation', 'articleContent', 'in_categories_block']})
                      ]

    remove_tags = [{'id': ['printShoppingList', 'addnoteLnk', 'btnUploadVideo', 'enlarge_image']},
                   {'class': ['subLnk', 'sbmWrapper', 'detail_division',
                              'entry-footer', 'comment-footer']},
                   dict(name='div', attrs={'class': ['tagged', 'comments']})
                   ]

    remove_tags_after = [dict(name='div', attrs={'class': 'entry-content'})]

    feeds = [
        (u'Recipes: Healthy dinner ', u'http://feeds.epicurious.com/healthy_recipes'),
        (u'New Recipes ', u'http://feeds.epicurious.com/newrecipes'),
        (u'Features ', u'http://feeds.epicurious.com/latestfeatures'),
        (u'Blogs ', u'http://feeds.feedburner.com/epicurious/epiblog')
    ]

    # During recursion only follow links to recipe pages.
    match_regexps = [
        r'http://www.epicurious.com/.*recipes/.*/views'
    ]

    preprocess_regexps = [
        # Re-join URLs that were split across a line break.
        (re.compile(r'/\n', re.DOTALL | re.IGNORECASE), lambda match: '/'),
        # Swap 116px thumbnails for the full-size image.  Bug fix: the dot
        # was previously unescaped, so the pattern also matched names like
        # 'foo_116xjpg'; '\.' restricts it to the literal '_116.jpg' suffix.
        (re.compile(r'_116\.jpg', re.DOTALL | re.IGNORECASE), lambda match: '.jpg'),
        # Drop the comments section (and everything after it) wholesale.
        (re.compile('<div class=\"comments\".*</body>',
                    re.DOTALL | re.IGNORECASE), lambda match: '</body>')
    ]

    def postprocess_html(self, soup, first_fetch):
        """Flatten layout tables into divs so the text reflows on e-readers."""
        for t in soup.findAll(['table', 'tr', 'td']):
            t.name = 'div'
        return soup

Binary file not shown.

Before

Width:  |  Height:  |  Size: 607 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 270 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 182 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 820 B

View File

@ -1,43 +0,0 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
juventudrebelde.co.cu
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Juventudrebelde_english(BasicNewsRecipe):
    """English edition of Juventud Rebelde, 'The newspaper of Cuban Youth'."""
    title = 'Juventud Rebelde in english'
    __author__ = 'Darko Miletic'
    description = 'The newspaper of Cuban Youth'
    publisher = 'Juventud Rebelde'
    category = 'news, politics, Cuba'
    language = 'en'
    encoding = 'iso-8859-1'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    remove_javascript = True

    # Legacy converter options built from the metadata above.
    html2lrf_options = [
        '--comment', description, '--category', category,
        '--publisher', publisher, '--ignore-tables'
    ]
    html2epub_options = ('publisher="' + publisher + '"\ncomments="' +
                         description + '"\ntags="' + category + '"')

    keep_only_tags = [dict(name='div', attrs={'class': 'read'})]

    feeds = [(u'All news', u'http://www.juventudrebelde.cip.cu/rss/all/')]

    def preprocess_html(self, soup):
        # Declare the content language so conversion handles it correctly.
        mtag = '<meta http-equiv="Content-Language" content="es-CU"/>'
        soup.head.insert(0, mtag)
        # Inline styles fight the recipe's own formatting; strip them all.
        for item in soup.findAll(style=True):
            del item['style']
        return soup

View File

@ -1,95 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetch Linuxdevices.
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class LinuxDevices(BasicNewsRecipe):
    """News about Linux driven Hardware, from linuxfordevices.com."""
    title = u'Linuxdevices'
    description = 'News about Linux driven Hardware'
    __author__ = 'Oliver Niesner'
    language = 'en'
    encoding = 'latin1'
    use_embedded_content = False
    timefmt = ' [%a %d %b %Y]'
    max_articles_per_feed = 50
    no_stylesheets = True
    remove_javascript = True
    conversion_options = {'linearize_tables': True}

    remove_tags_after = [dict(id='intelliTxt')]
    filter_regexps = [r'ad\.doubleclick\.net']

    # Page chrome to strip, grouped by tag/attribute so the list below can
    # be generated instead of written out entry by entry.
    _div_classes = ['bannerSuperBanner', 'bannerSky', 'footerLinks',
                    'seitenanfang', 'artikelBox navigatorBox',
                    'similar-article-box', 'videoBigHack']
    _table_classes = ['pageAktiv', 'xartable', 'wpnavi', 'bgcontent absatz',
                      'footer', 'artikelBox', 'kommentare', 'pageBoxBot',
                      'contentpaneopen']
    _td_classes = ['mar5', 'ArticleWidgetsHeadline', 'artikelDruckenRight',
                   'width="200"']
    _anchor_hrefs = ['http://www.addthis.com/bookmark.php', '/news',
                     '/cgi-bin/survey/survey.cgi',
                     '/cgi-bin/board/UltraBoard.pl']
    _ids = ['ArticleWidgets', 'headerLBox', 'nointelliTXT', 'rechteSpalte',
            'newsticker-list-small', 'ntop5', 'ntop5send', 'ntop5commented',
            'nnav-bgheader', 'nnav-headerteaser', 'nnav-head', 'nnav-top',
            'readcomment']

    remove_tags = (
        [dict(name='div', attrs={'class': c}) for c in _div_classes] +
        [dict(name='div', attrs={'border': '0'})] +
        [dict(name='table', attrs={'class': c}) for c in _table_classes] +
        [dict(name='table', attrs={'td': 'height="3"'})] +
        [dict(name='td', attrs={'class': c}) for c in _td_classes] +
        [dict(name='td', attrs={'nowrap': 'nowrap'}),
         dict(name='td', attrs={'align': 'left'}),
         dict(name='td', attrs={'height': '5'}),
         dict(name='span', attrs={'class': 'content_rating'}),
         dict(name='span', attrs={'class': 'hidePrint'})] +
        [dict(name='a', attrs={'href': h}) for h in _anchor_hrefs] +
        [dict(name='iframe'), dict(name='form')] +
        [dict(id=i) for i in _ids]
    )

    feeds = [(u'Linuxdevices', u'http://www.linuxfordevices.com/rss.xml')]

    def preprocess_html(self, soup):
        """Drop 'Related...' headings, stray lists and leading line breaks."""
        related = re.compile(r"^Related")
        for bold in soup.findAll('b', text=related):
            bold.extract()
        for lst in soup.findAll(re.compile('^ul')):
            lst.extract()
        for linebreak in soup.findAll('br', limit=10):
            linebreak.extract()
        return soup

    def postprocess_html(self, soup, first):
        """Flatten layout tables into divs so the text reflows."""
        for tag in soup.findAll(name=['table', 'tr', 'td']):
            tag.name = 'div'
        return soup

View File

@ -1,72 +0,0 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini'
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
__version__ = 'v1.01'
__date__ = '14, January 2010'
'''
http://www.macvideo.tv/
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
class macVideo(BasicNewsRecipe):
    """MacVideo.tv, IDG's journal on video production and editing."""
    title = 'MacVideo '
    __author__ = 'Lorenzo Vigentini'
    description = 'MacVideo is an independent journal not affiliated with Apple Computer, It is a publication of IDG Communication focusing on video production and editing.'  # noqa
    cover_url = 'http://www.macvideo.tv/images/shared/macvideo-logo.jpg'
    publisher = 'IDG Communication'
    category = 'Apple, Mac, video, computing, product reviews, editing, cameras, production'
    language = 'en'
    encoding = 'cp1252'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 30
    max_articles_per_feed = 25
    use_embedded_content = False
    recursion = 10
    remove_javascript = True
    no_stylesheets = True

    # Articles are fetched via their print view and cached in temp files.
    articles_are_obfuscated = True
    temp_files = []

    keep_only_tags = [
        dict(name='div', attrs={'id': 'mainContent'})
    ]
    remove_tags = [
        dict(name='div', attrs={'class': ['submissionBar', 'mpuContainer']}),
        dict(name='p', attrs={'class': 'articlePag'}),
        dict(name='ul', attrs={'id': 'articleIconsList'})
    ]

    feeds = [
        (u'News', u'http://www.macvideo.tv/rss/feeds/macvideo-news.xml'),
        (u'Reviews', u'http://www.macvideo.tv/rss/feeds/macvideo-reviews.xml'),
        (u'Interviews', u'http://www.macvideo.tv/rss/feeds/macvideo-features-interviews.xml'),
        (u'Features', u'http://www.macvideo.tv/rss/feeds/macvideo-features-features.xml'),
        (u'Rick Young', u'http://www.macvideo.tv/rss/feeds/blog100140.xml'),
        (u'Matt Davis', u'http://www.macvideo.tv/rss/feeds/blog101658.xml'),
        (u'Adrian Miskelly', u'http://www.macvideo.tv/rss/feeds/blog101750.xml')
    ]

    def get_obfuscated_article(self, url):
        """Download the print view of *url* to a temp file; return its path."""
        browser = self.get_browser()
        browser.open(url + '&print')
        # NOTE(review): the positional argument to follow_link() is
        # mechanize's 'text', not 'url' — confirm this matches the intended
        # link selection before changing it.
        response = browser.follow_link(url, nr=0)
        content = response.read()
        tmpfile = PersistentTemporaryFile('_fa.html')
        tmpfile.write(content)
        tmpfile.close()
        self.temp_files.append(tmpfile)
        return tmpfile.name

View File

@ -1,47 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.moneynews.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class MoneyNews(BasicNewsRecipe):
    """Financial news worldwide from moneynews.com (Newsmax)."""
    title = 'Moneynews.com'
    __author__ = 'Darko Miletic'
    description = 'Financial news worldwide'
    publisher = 'Newsmax.com'
    language = 'en'
    category = 'news, finances, USA, business'
    encoding = 'utf8'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    extra_css = 'img{display: block} body{font-family: Arial, Helvetica, sans-serif}'

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
        'linearize_tables': True,
    }

    feeds = [
        (u'Street Talk', u'http://www.moneynews.com/rss/StreetTalk/8.xml'),
        (u'Finance News', u'http://www.moneynews.com/rss/FinanceNews/4.xml'),
        (u'Economy', u'http://www.moneynews.com/rss/Economy/2.xml'),
        (u'Companies', u'http://www.moneynews.com/rss/Companies/6.xml'),
        (u'Markets', u'http://www.moneynews.com/rss/Markets/7.xml'),
        (u'Investing & Analysis', u'http://www.moneynews.com/rss/InvestingAnalysis/17.xml')
    ]

    keep_only_tags = [dict(name='div', attrs={'class': 'copy'})]

    remove_tags = [
        dict(attrs={'class': ['MsoNormal', 'MsoNoSpacing']}),
        dict(name=['object', 'link', 'embed', 'form', 'meta']),
    ]

    def print_version(self, url):
        """Map an article URL onto the printer-friendly template."""
        # The node id is the last path component of the article URL.
        node_id = url.rpartition('/')[2]
        return 'http://www.moneynews.com/PrintTemplate?nodeid=' + node_id

View File

@ -1,23 +0,0 @@
from calibre.web.feeds.news import BasicNewsRecipe
class OpenLeft(BasicNewsRecipe):
    """Progressive American commentary on current events (openleft.com)."""
    title = 'Open Left'
    description = 'Progressive American commentary on current events'
    category = 'news, commentary'
    language = 'en'
    __author__ = 'Xanthan Gum'
    oldest_article = 7           # skip anything older than a week
    max_articles_per_feed = 100  # cap each feed at 100 articles
    feeds = [(u'Articles', u'http://www.openleft.com/rss/rss2.xml')]

View File

@ -1,114 +0,0 @@
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class RichmondTimesDispatch(BasicNewsRecipe):
    # Daily newspaper for Richmond, Virginia and surrounding regions;
    # aggregates many per-section RSS feeds from www2.timesdispatch.com.
    title = u'Richmond Times-Dispatch'
    description = "The Richmond Times-Dispatch is the primary daily newspaper in Richmond, \
the capital of Virginia, United States, as well as the Virginia cities of Petersburg, \
Chester. Hopewell, Colonial Heights, Charlottesville, Lynchburg, Waynesboro, \
and is also a default paper for rural regions of the state. \
The RTD has published in some form for more than 150 years."
    __author__ = '_reader'
    __date__ = '17 October 2012'
    __version__ = '1.6'
    cover_url = 'http://static2.dukecms.com/va_tn/timesdispatch_com/site-media/img/icons/logo252x97.png'
    masthead_url = 'http://static2.dukecms.com/va_tn/timesdispatch_com/site-media/img/icons/logo252x97.png'
    language = 'en'
    oldest_article = 1.5  # days
    max_articles_per_feed = 100
    # The many overlapping section feeds repeat stories; dedupe aggressively.
    ignore_duplicate_articles = {'title', 'url'}
    needs_subscription = False
    publisher = 'timesdispatch.com'
    category = 'news, commentary'
    tags = 'news'
    publication_type = 'newspaper'
    no_stylesheets = True
    use_embedded_content = False
    encoding = None  # None: let the fetcher detect each page's encoding
    simultaneous_downloads = 20
    recursions = 0
    remove_javascript = True
    remove_empty_feeds = True
    auto_cleanup = False
    conversion_options = {
        'comments': description,
        'tags': tags,
        'language': language,
        'publisher': publisher,
        'authors': publisher,
        'smarten_punctuation': True
    }
    remove_tags_before = dict(id='hnews hentry item')
    remove_tags_after = dict(name='hr')
    remove_tags = [
        dict(name='div', attrs={'id': ['mg_hd', 'mg_ft', 'sr_b', 'comments_left', 'comments_right']}), dict(name='div', attrs={'class': ['bottom_social', 'article_bottom']}), dict( name='table', attrs={'class': ['ap-mediabox-table', 'ap-htmltable-table', 'ap-photogallery-table', 'ap-htmlfragment-table']})  # noqa
    ]
    # Strip AP story-table wrappers, self-referential links/images,
    # horizontal rules and license/copyright boilerplate before conversion.
    preprocess_regexps = [
        (re.compile(r'<table class="ap-story-table hnews hentry item".*?<td class="ap-story-td">',
                    re.DOTALL | re.IGNORECASE), lambda match: ''),
        (re.compile(r'<p>\s*http://www2.timesdispatch.*?</p>',
                    re.DOTALL | re.IGNORECASE), lambda match: ''),
        (re.compile(r'<p>\s*<img src="http://static2.dukecms.*?</p>',
                    re.DOTALL | re.IGNORECASE), lambda match: ''),
        (re.compile(r'<p>\s*<a href="http://www2.timesdispatch.*?</p>',
                    re.DOTALL | re.IGNORECASE), lambda match: ''),
        (re.compile(r'<hr.*?>', re.DOTALL | re.IGNORECASE),
         lambda match: ''),  # strip <hr /> line break
        (re.compile(r'<a\s*rel="item-license.*?Use</a>.', re.DOTALL |
                    re.IGNORECASE), lambda match: ''),  # strip <hr /> line break
        (re.compile(r'<small>\s*Richmond Times-Dispatch.*?</small>', re.DOTALL |
                    re.IGNORECASE), lambda match: ''),  # strip <hr /> line break
    ]
    feeds = [
        ('News', 'http://www2.timesdispatch.com/list/feed/rss/news-archive'),
        ('Breaking News', 'http://www2.timesdispatch.com/list/feed/rss/breaking-news'),
        ('National News', 'http://www2.timesdispatch.com/list/feed/rss/national-news'),
        ('Local News', 'http://www2.timesdispatch.com/list/feed/rss/local-news'),
        ('Business', 'http://www2.timesdispatch.com/list/feed/rss/business'),
        ('Local Business', 'http://www2.timesdispatch.com/list/feed/rss/local-business'),
        ('Politics', 'http://www2.timesdispatch.com/list/feed/rss/politics'),
        ('Virginia Politics',
         'http://www2.timesdispatch.com/list/feed/rss/virginia-politics'),
        ('History', 'http://www2.timesdispatch.com/feed/rss/special_section/news/history'),
        ('Sports', 'http://www2.timesdispatch.com/list/feed/rss/sports2'),
        ('Health', 'http://www2.timesdispatch.com/feed/rss/lifestyles/health_med_fit/'),
        ('Entertainment/Life', 'http://www2.timesdispatch.com/list/feed/rss/entertainment'),
        ('Arts/Theatre',
         'http://www2.timesdispatch.com/feed/rss/entertainment/arts_theatre/'),
        ('Movies', 'http://www2.timesdispatch.com/list/feed/rss/movies'),
        ('Music', 'http://www2.timesdispatch.com/list/feed/rss/music'),
        ('Dining & Food', 'http://www2.timesdispatch.com/list/feed/rss/dining'),
        ('Home & Garden', 'http://www2.timesdispatch.com/list/feed/rss/home-and-garden/'),
        ('Travel', 'http://www2.timesdispatch.com/feed/rss/travel/'),
        ('Opinion', 'http://www2.timesdispatch.com/feed/rss/news/opinion/'),
        ('Editorials', 'http://www2.timesdispatch.com/list/feed/rss/editorial-desk'),
        ('Columnists and Blogs',
         'http://www2.timesdispatch.com/list/feed/rss/news-columnists-blogs'),
        ('Opinion Columnists',
         'http://www2.timesdispatch.com/list/feed/rss/opinion-editorial-columnists'),
        ('Letters to the Editor',
         'http://www2.timesdispatch.com/list/feed/rss/opinion-letters'),
        ('Traffic', 'http://www2.timesdispatch.com/list/feed/rss/traffic'),
        ('Drives', 'http://www2.timesdispatch.com/feed/rss/classifieds/transportation/'),
    ]

    def print_version(self, url):
        # Locally-written articles end in "-<digits>/"; the substitution
        # reduces those to just the numeric id.  AP wire URLs do not match,
        # so the substitution leaves the full URL (still containing 'http'),
        # which is returned unchanged because AP stories have no print view.
        article_num = re.sub(r'(^.*)\-([0-9]{4,10})\/$', r'\g<2>', url)
        ap_pat = re.compile('http')
        # print '\nDEBUG>>>>>>>>: article_num: ', article_num
        # print 'DEBUG>>>>>>>>: ap_pat.search(article_num): ',
        # ap_pat.search(article_num)
        if ap_pat.search(article_num):  # AP article, no print url
            # print 'DEBUG>>>>>>>>: AP URL: ', url
            return url
        else:
            printURL = 'http://www2.timesdispatch.com/member-center/share-this/print/?content=ar' + article_num
            return printURL

View File

@ -1,31 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Nadid <nadid.skywalker at gmail.com>'
'''
http://www.sinfest.net
'''
from calibre.web.feeds.news import BasicNewsRecipe
class SinfestBig(BasicNewsRecipe):
    """Daily Sinfest webcomic, via henrik.nyh.se's scraper feed."""
    title = 'Sinfest'
    __author__ = 'nadid'
    description = 'Sinfest'
    publisher = 'Tatsuya Ishida/Museworks'
    category = 'comic'
    language = 'en'
    encoding = 'utf-8'
    reverse_article_order = False
    oldest_article = 5
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = True

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    feeds = [(u'SinFest', u'http://henrik.nyh.se/scrapers/sinfest.rss')]

    def get_article_url(self, article):
        # Use the plain RSS <link> element for each strip.
        return article.get('link')

View File

@ -1,39 +0,0 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1278049615(BasicNewsRecipe):
    """Austin (Texas) daily newspaper, from statesman.com."""
    title = u'Statesman'
    # Bug fix: this attribute was misspelled 'pubisher', which the
    # conversion pipeline silently ignores, so the metadata was dropped.
    publisher = 'http://www.statesman.com/'
    description = 'Austin Texas Daily Newspaper'
    category = 'News, Austin, Texas'
    __author__ = 'rty'
    oldest_article = 3
    max_articles_per_feed = 100
    language = 'en'
    encoding = 'utf-8'
    masthead_url = "http://www.statesman.com/images/cmg-logo.gif"
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    conversion_options = {'linearize_tables': True}

    feeds = [(u'News',
              u'http://www.statesman.com/section-rss.do?source=news&includeSubSections=true'),
             (u'Local', u'http://www.statesman.com/section-rss.do?source=local&includeSubSections=true'),
             (u'Business', u'http://www.statesman.com/section-rss.do?source=business&includeSubSections=true'),
             (u'Life', u'http://www.statesman.com/section-rss.do?source=life&includesubsection=true'),
             (u'Editorial', u'http://www.statesman.com/section-rss.do?source=opinion&includesubsections=true'),
             (u'Sports', u'http://www.statesman.com/section-rss.do?source=sports&includeSubSections=true')
             ]

    remove_tags = [
        dict(name='div', attrs={'id': 'cxArticleOptions'}),
        {'class': ['perma', 'comments', 'trail', 'share-buttons',
                   'toggle_show_on']},
    ]
    keep_only_tags = [
        dict(name='div', attrs={'class': 'cxArticleHeader'}),
        dict(name='div', attrs={'id': ['cxArticleBodyText',
                                       'content']}),
    ]

View File

@ -1,53 +0,0 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
utne.com
'''
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
class Utne(BasicNewsRecipe):
    """Utne Reader news and commentary, from utne.com."""
    title = u'Utne reader'
    __author__ = 'Darko Miletic'
    description = 'News'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'en'
    cover_url = 'http://www.utne.com/images/template/logo.gif'

    remove_tags = [
        dict(name='a', attrs={'id': 'ctl00_blankmaster_lnkBanner'}), dict(
            name='object')
    ]

    feeds = [
        (u'Politics', u'http://www.utne.com/rss/Politics.xml'),
        (u'Environment', u'http://www.utne.com/rss/Environment.xml'),
        (u'Media', u'http://www.utne.com/rss/Media.xml'),
        (u'Great writing', u'http://www.utne.com/rss/Great-Writing.xml'),
        (u'Science & Technology', u'http://www.utne.com/rss/Science-Technology.xml'),
        (u'Arts', u'http://www.utne.com/rss/Arts.xml')
    ]

    def print_version(self, url):
        """Return the print-friendly URL, or *url* itself when the page
        exposes no print link."""
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
        print_link = soup.find(
            'a', {'id': 'ctl00_defaultmaster_Blog_tools1_lnkPrint'})
        if print_link is None:
            return url
        return print_link['href']

    def preprocess_html(self, soup):
        """Force UTF-8 and drop the body onload handler before conversion."""
        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        soup.head.insert(0, mtag)
        # Robustness fix: the attribute was deleted unconditionally, which
        # raises KeyError on pages whose <body> has no onload handler.
        if soup.body is not None and soup.body.get('onload') is not None:
            del soup.body['onload']
        return soup

View File

@ -1,116 +0,0 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__copyright__ = '2010, Starson17'
'''
www.nbcolympics.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Olympics_2010(BasicNewsRecipe):
    """NBC coverage of the Vancouver 2010 Winter Olympics."""
    title = u'NBC Olympics 2010'
    __author__ = 'Starson17'
    description = 'Olympics 2010'
    cover_url = 'http://www.digitaljournal.com/img/1/1/2/1/i/4/7/6/o/WinterOlympics2010-logo.jpg'
    publisher = 'Olympics 2010'
    tags = 'Olympics news'
    language = 'en'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    oldest_article = 7
    max_articles_per_feed = 10

    keep_only_tags = [
        dict(name='div', attrs={'class': ['Article ', 'ArticleGallery']}),
    ]
    remove_tags = [
        dict(name='div', attrs={'id': ['RelatedTagsBox', 'ShareBox']}),
        dict(name='div', attrs={
            'class': ['DateUtilities', 'PhotoGallery BoxRight', 'Frame', 'ToolBox']}),
    ]

    # Full feed index: http://www.nbcolympics.com/rss/index.html
    # General news-center and Team USA feeds.
    _general_feeds = [
        ('NBCOlympics.com - News',
         'http://www.nbcolympics.com/rss/newscenter/mostpopular.xml'),
        ('NBCOlympics.com - News - Top Stories',
         'http://www.nbcolympics.com/rss/newscenter/topstories.xml'),
        ('NBCOlympics.com - News - Latest Headlines',
         'http://www.nbcolympics.com/rss/newscenter/latestnews.xml'),
        ('NBCOlympics.com - Team USA - Latest news',
         'http://www.nbcolympics.com/rss/countries/team-usa/index.xml'),
    ]
    # (sport name, feed code) pairs; NOTE(review): 'Speed Skating' reuses
    # code 'AS', exactly as in the upstream feed list — confirm upstream.
    _sports = [
        ('Alpine Skiing', 'AS'),
        ('Biathlon', 'BT'),
        ('Bobsled', 'BS'),
        ('Cross-Country', 'CC'),
        ('Curling', 'CU'),
        ('Figure Skating', 'FS'),
        ('Freestyle Skiing', 'FR'),
        ('Hockey', 'IH'),
        ('Luge', 'LG'),
        ('Nordic Combined', 'NC'),
        ('Short Track', 'ST'),
        ('Skeleton', 'SN'),
        ('Ski Jumping', 'SJ'),
        ('Snowboarding', 'SB'),
        ('Speed Skating', 'AS'),
    ]
    feeds = _general_feeds + [
        ('NBCOlympics.com - %s - Latest News' % sport,
         'http://www.nbcolympics.com/rss/sport=%s/latestnews.xml' % code)
        for sport, code in _sports
    ]

    extra_css = '''
    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''