This commit is contained in:
Kovid Goyal 2016-10-14 09:12:25 +05:30
commit 3def2109c0
20 changed files with 137 additions and 494 deletions

View File

@ -19,54 +19,14 @@ class Moscowtimes(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
remove_empty_feeds = True remove_empty_feeds = True
encoding = 'cp1251'
masthead_url = 'http://www.themoscowtimes.com/bitrix/templates/tmt/img/logo.gif'
publication_type = 'newspaper' publication_type = 'newspaper'
auto_cleanup = True
conversion_options = {
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
extra_css = '''
h1{ color:#0066B3; font-family: Georgia,serif ; font-size: large}
.article_date{ font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; color:#000000; font-size: x-small;}
.autors{color:#999999 ; font-weight: bold ; font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size: x-small; }
.photoautors{ color:#999999 ; font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size: x-small; }
.text{font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size:75%; }
'''
feeds = [ feeds = [
(u'Top Stories', u'https://themoscowtimes.com/feeds/main.xml'),
(u'Top Stories', u'http://www.themoscowtimes.com/rss/top'), (u'Moscow', u'https://themoscowtimes.com/feeds/moscow.xml'),
(u'Current Issue', u'http://www.themoscowtimes.com/rss/issue'), (u'Russia', u'https://themoscowtimes.com/feeds/russia.xml'),
(u'News', u'http://www.themoscowtimes.com/rss/news'), (u'World', u'https://themoscowtimes.com/feeds/world.xml'),
(u'Business', u'http://www.themoscowtimes.com/rss/business'), (u'Business', u'https://themoscowtimes.com/feeds/business.xml'),
(u'Art and Ideas', u'http://www.themoscowtimes.com/rss/art'), (u'Opinion', u'https://themoscowtimes.com/feeds/opinion.xml')
(u'Opinion', u'http://www.themoscowtimes.com/rss/opinion')
] ]
keep_only_tags = [dict(name='div', attrs={'id': 'content'})]
remove_tags = [
dict(name='div', attrs={'class': ['photo_nav', 'phototext']}), dict(
name=['iframe', 'meta', 'base', 'link', 'embed', 'object'])
]
def preprocess_html(self, soup):
for lnk in soup.findAll('a'):
if lnk.string is not None:
ind = self.tag_to_string(lnk)
lnk.replaceWith(ind)
return soup
def print_version(self, url):
return url.replace('.themoscowtimes.com/', '.themoscowtimes.com/print/')
def get_cover_url(self):
cover_url = None
href = 'http://www.themoscowtimes.com/pdf/'
soup = self.index_to_soup(href)
div = soup.find('div', attrs={'class': 'left'})
if div:
a = div.find('a')
if a:
cover_url = 'http://www.themoscowtimes.com' + a.img['src']
return cover_url

View File

@ -1,4 +1,3 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
''' '''
@ -11,7 +10,8 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Newstraitstimes(BasicNewsRecipe): class Newstraitstimes(BasicNewsRecipe):
title = 'New Straits Times from Malaysia' title = 'New Straits Times from Malaysia'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Learning Curve, Sunday People, New Straits Times from Malaysia' description = ('Learning Curve, Sunday People, '
'New Straits Times from Malaysia')
publisher = 'nst.com.my' publisher = 'nst.com.my'
category = 'news, politics, Malaysia' category = 'news, politics, Malaysia'
oldest_article = 2 oldest_article = 2
@ -20,13 +20,6 @@ class Newstraitstimes(BasicNewsRecipe):
encoding = 'cp1252' encoding = 'cp1252'
use_embedded_content = False use_embedded_content = False
language = 'en' language = 'en'
masthead_url = 'http://www.nst.com.my/Current_News/NST/Images/new-nstonline.jpg' auto_cleanup = True
conversion_options = { feeds = [(u'Articles', u'http://www.nst.com.my/latest.xml')]
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
remove_tags = [dict(name=['link', 'table'])]
keep_only_tags = dict(name='div', attrs={'id': 'haidah'})
feeds = [(u'Articles', u'http://www.nst.com.my/rss/allSec')]

View File

@ -19,14 +19,6 @@ class OldNewThing(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
publication_type = 'blog' publication_type = 'blog'
extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif} .code{font-family: "Lucida Console",monospace} ' auto_cleanup = True
conversion_options = { feeds = [(u'Posts', u'https://blogs.msdn.microsoft.com/oldnewthing/feed')]
'comment': description, 'tags': 'blog, windows, microsoft, programming', 'publisher': 'Raymond Chen', 'language': language
}
remove_attributes = ['width', 'height']
keep_only_tags = [dict(attrs={'class': 'full-post'})]
remove_tags = [
dict(attrs={'class': ['post-attributes', 'post-tags', 'post-actions']})]
feeds = [(u'Posts', u'http://blogs.msdn.com/oldnewthing/rss.xml')]

View File

@ -32,56 +32,16 @@ class pcAdvisor(BasicNewsRecipe):
remove_javascript = True remove_javascript = True
no_stylesheets = True no_stylesheets = True
auto_cleanup = True
keep_only_tags = [
dict(name='div', attrs={'id': 'articlecontent'})
]
remove_tags = [
dict(name='div', attrs={'id': ['crosssitesignup', 'submitarticle', 'dontPrint',
'commentsForm', 'userReviewFormContainer', 'reevooContainerId']}),
dict(name='div', attrs={'class': 'mpu'}),
dict(name='p', attrs={'id': 'articlePageList'}),
dict(name='div', attrs={
'style': ['margin: 0pt 10px 5px;', 'margin: 0pt 10px 5px;']}),
dict(name='p', attrs={'class': 'dontPrint'}),
dict(name='h2', attrs={'class': 'sectionTitle'}),
dict(name='a', attrs={'title': 'Subscribe to PC Advisor'}),
dict(name='a', attrs={'name': 'revooContent'}),
{'name': ['form', 'script', 'link']}
]
remove_tags_after = [
dict(name='p', attrs={'id': 'crosssitesignup'})
]
def get_article_url(self, article):
return article.get('guid', None)
feeds = [ feeds = [
(u'News Headlines', u'http://www.pcadvisor.co.uk/rss/feeds/pcanews.xml'), (u'Latest', u'http://www.pcadvisor.co.uk/latest/rss'),
(u'Reviews', u'http://www.pcadvisor.co.uk/rss/feeds/pcareviews.xml'), (u'News', u'http://www.pcadvisor.co.uk/news/rss'),
(u'New Products', (u'How-tos', u'http://www.pcadvisor.co.uk/how-to/rss'),
u'http://www.pcadvisor.co.uk/rss/feeds/blog18.xml'), (u'Reviews', u'http://www.pcadvisor.co.uk/review/rss'),
(u'PC Advisor Blog', (u'Video Content', u'http://www.pcadvisor.co.uk/video/rss'),
u'http://www.pcadvisor.co.uk/rss/feeds/blog4.xml'), (u'iPhone', u'http://www.pcadvisor.co.uk/latest/iphone/rss'),
(u'PC Security', (u'iPad', u'http://www.pcadvisor.co.uk/latest/ipad/rss'),
u'http://www.pcadvisor.co.uk/rss/feeds/pca-security.xml'), (u'Mac', u'http://www.pcadvisor.co.uk/latest/mac/rss'),
(u'Laptops', u'http://www.pcadvisor.co.uk/rss/feeds/pca-laptop.xml'), (u'Apple', u'http://www.pcadvisor.co.uk/latest/apple/rss'),
(u'Green Computing',
u'http://www.pcadvisor.co.uk/rss/feeds/pca-green-computing.xml'),
(u'Internet and broadband',
u'http://www.pcadvisor.co.uk/rss/feeds/pca-internet.xml'),
(u'Prones and PDAs',
u'http://www.pcadvisor.co.uk/rss/feeds/pca-phones.xml'),
(u'Software', u'http://www.pcadvisor.co.uk/rss/feeds/pca-software.xml'),
(u'Small Business',
u'http://www.pcadvisor.co.uk/rss/feeds/pca-small-business.xml'),
(u'Photo and video',
u'http://www.pcadvisor.co.uk/rss/feeds/pca-photo-video.xml'),
(u'Mac News', u'http://www.pcadvisor.co.uk/rss/feeds/pca-mac.xml'),
(u'Linux', u'http://www.pcadvisor.co.uk/rss/feeds/pca-linux.xml'),
(u'WiFi and Networking',
u'http://www.pcadvisor.co.uk/rss/feeds/pca-networking.xml'),
(u'Gadgets', u'http://www.pcadvisor.co.uk/rss/feeds/pca-gadgets.xml')
] ]

View File

@ -19,8 +19,6 @@ class HindustanTimes(BasicNewsRecipe):
'http://phys.org/rss-feed/physics-news/'), 'http://phys.org/rss-feed/physics-news/'),
('Space and Earth', ('Space and Earth',
'http://phys.org/rss-feed/space-news/'), 'http://phys.org/rss-feed/space-news/'),
('Electronics',
'http://phys.org/rss-feed/electronics-news/'),
('Chemistry', ('Chemistry',
'http://phys.org/rss-feed/chemistry-news/'), 'http://phys.org/rss-feed/chemistry-news/'),
('Biology', ('Biology',

View File

@ -22,26 +22,28 @@ class Politiken_dk(BasicNewsRecipe):
encoding = 'cp1252' encoding = 'cp1252'
language = 'da' language = 'da'
extra_css = ' body{font-family: Arial,Helvetica,sans-serif } h1{font-family: Georgia,"Times New Roman",Times,serif } ' extra_css = (' body{font-family: Arial,Helvetica,sans-serif } '
'h1{font-family: Georgia,"Times New Roman",Times,serif } ')
conversion_options = { conversion_options = {
'comment': description, 'tags': category, 'publisher': publisher, 'language': language 'comment': description,
'tags': category,
'publisher': publisher,
'language': language
} }
feeds = [ feeds = [
(u'Tophistorier', u'http://politiken.dk/rss/tophistorier.rss'),
(u'Tophistorier', u'http://politiken.dk/rss/tophistorier.rss'), (u'Seneste nyt', u'http://politiken.dk/rss/senestenyt.rss'),
(u'Seneste nyt', u'http://politiken.dk/rss/senestenyt.rss'), (u'Mest laeste', u'http://politiken.dk/rss/mestlaeste.rss'),
(u'Mest laeste', u'http://politiken.dk/rss/mestlaeste.rss'), (u'Danmark', u'http://politiken.dk/rss/indland.rss'),
(u'Danmark', u'http://politiken.dk/rss/indland.rss'), (u'Politik', u'http://politiken.dk/rss/politik.rss'),
(u'Politik', u'http://politiken.dk/rss/politik.rss'), (u'Klima', u'http://politiken.dk/rss/klima.rss'),
(u'Klima', u'http://politiken.dk/rss/klima.rss'), (u'Internationalt', u'http://politiken.dk/rss/udland.rss'),
(u'Internationalt', u'http://politiken.dk/rss/udland.rss'), (u'Erhverv', u'http://politiken.dk/rss/erhverv.rss'),
(u'Erhverv', u'http://politiken.dk/rss/erhverv.rss'), (u'Kultur', u'http://politiken.dk/rss/kultur.rss'),
(u'Kultur', u'http://politiken.dk/rss/kultur.rss'), (u'Sport', u'http://politiken.dk/rss/sport.rss'),
(u'Sport', u'http://politiken.dk/rss/sport.rss'), (u'Uddannelse', u'http://politiken.dk/rss/uddannelse.rss'),
(u'Uddannelse', u'http://politiken.dk/rss/uddannelse.rss'),
(u'Videnskab', u'http://politiken.dk/rss/videnskab.rss')
] ]
remove_tags_before = dict(name='h1') remove_tags_before = dict(name='h1')
remove_tags = [ remove_tags = [

View File

@ -4,7 +4,6 @@ __copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
rollingstone.com rollingstone.com
''' '''
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -22,44 +21,8 @@ class RollingStone(BasicNewsRecipe):
language = 'en' language = 'en'
remove_empty_feeds = True remove_empty_feeds = True
publication_type = 'magazine' publication_type = 'magazine'
masthead_url = 'http://www.rollingstone.com/templates/rolling-stone-templates/theme/rstheme/images/rsLogo.png' auto_cleanup = True
extra_css = """
body{font-family: Georgia,Times,serif }
img{margin-bottom: 0.4em; display:block}
"""
conversion_options = {
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
preprocess_regexps = [
(re.compile(r'xml:lang="en">.*?<head>', re.DOTALL | re.IGNORECASE), lambda match: 'xml:lang="en">\n<head>\n'), (re.compile(
r'</title>.*?</head>', re.DOTALL | re.IGNORECASE), lambda match: '</title>\n</head>\n')
]
keep_only_tags = [
dict(attrs={'class': ['headerImgHolder', 'headerContent']}), dict(name='div', attrs={'id': [
'teaser', 'storyTextContainer']}), dict(name='div', attrs={'class': 'blogDetailModule clearfix'})
]
remove_tags = [
dict(name=['meta', 'iframe', 'object', 'embed']), dict(
attrs={'id': 'mpStoryHeader'}), dict(attrs={'class': 'relatedTopics'})
]
remove_attributes = ['lang', 'onclick', 'width', 'height', 'name']
remove_tags_before = dict(attrs={'class': 'bloggerInfo'})
remove_tags_after = dict(attrs={'class': 'relatedTopics'})
feeds = [ feeds = [
(u'All News', u'http://www.rollingstone.com/siteServices/rss/allNews'),
(u'All News', u'http://www.rollingstone.com/siteServices/rss/allNews'),
(u'All Blogs', u'http://www.rollingstone.com/siteServices/rss/allBlogs'),
(u'Movie Reviews', u'http://www.rollingstone.com/siteServices/rss/movieReviews'),
(u'Album Reviews', u'http://www.rollingstone.com/siteServices/rss/albumReviews'),
(u'Song Reviews', u'http://www.rollingstone.com/siteServices/rss/songReviews')
] ]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -1,45 +0,0 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__author__ = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobileread.com'
__version__ = 'v1.01'
__date__ = '07, October 2010'
__description__ = 'Rolling Stones Mag'
'''
http://www.rollingstone.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class RollingStones(BasicNewsRecipe):
__author__ = 'Tony Stegall'
description = 'Rolling Stones Mag'
cover_url = 'http://gallery.celebritypro.com/data/media/648/kid-rock-rolling-stone-cover.jpg'
masthead_url = 'http://origin.myfonts.com/s/ec/cc-200804/Rolling_Stone-logo.gif'
title = 'Rolling Stones Mag'
category = 'Music Reviews, Movie Reviews, entertainment news'
language = 'en'
timefmt = '[%a, %d %b, %Y]'
oldest_article = 15
max_articles_per_feed = 25
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
feeds = [
(u'News', u'http://www.rollingstone.com/siteServices/rss/allNews'),
(u'Blogs', u'http://www.rollingstone.com/siteServices/rss/allBlogs'),
(u'Movie Reviews', u'http://www.rollingstone.com/siteServices/rss/movieReviews'),
(u'Album Reviews', u'http://www.rollingstone.com/siteServices/rss/albumReviews'),
(u'Song Reviews', u'http://www.rollingstone.com/siteServices/rss/songReviews'),
]
def print_version(self, url):
return url + '?print=true'

View File

@ -14,6 +14,5 @@ class SanFranciscoBayGuardian(BasicNewsRecipe):
] ]
feeds = [ feeds = [
('sfbg', 'http://www.sfbg.com/rss.xml'), ('sfbg', 'http://www.sfbg.com/feed/'),
] ]

View File

@ -9,23 +9,9 @@ class Shacknews(BasicNewsRecipe):
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
language = 'en' language = 'en'
no_stylesheets = True no_stylesheets = True
remove_tags = [dict(name='div', attrs={'class': ['nuggets', 'comments']}), auto_cleanup = True
dict(name='p', attrs={'class': 'videoembed'})]
keep_only_tags = [dict(name='div', attrs={'class': 'story'})]
feeds = [ feeds = [
(u'Latest News', u'http://feed.shacknews.com/shackfeed.xml'), (u'Latest News', u'http://www.shacknews.com/shackfeed.xml'),
(u'PC', u'http://feed.shacknews.com/extras/tag_rss.x/PC'),
(u'Wii', u'http://feed.shacknews.com/extras/tag_rss.x/Nintendo+Wii'),
(u'Xbox 360', u'http://feed.shacknews.com/extras/tag_rss.x/Xbox+360'),
(u'Playstation 3',
u'http://feed.shacknews.com/extras/tag_rss.x/PlayStation+3'),
(u'PSP', u'http://feed.shacknews.com/extras/tag_rss.x/PSP'),
(u'Nintendo DS', u'http://feed.shacknews.com/extras/tag_rss.x/Nintendo+DS'),
(u'iPhone', u'http://feed.shacknews.com/extras/tag_rss.x/iPhone'),
(u'DLC', u'http://feed.shacknews.com/extras/tag_rss.x/DLC'),
(u'Valve', u'http://feed.shacknews.com/extras/tag_rss.x/Valve'),
(u'Electronic Arts',
u'http://feed.shacknews.com/extras/tag_rss.x/Electronic+Arts')
] ]

View File

@ -14,69 +14,19 @@ class Starbulletin(BasicNewsRecipe):
publisher = 'Honolulu Star-Advertiser' publisher = 'Honolulu Star-Advertiser'
category = 'news, Honolulu, Hawaii' category = 'news, Honolulu, Hawaii'
oldest_article = 2 oldest_article = 2
needs_subscription = True
max_articles_per_feed = 100 max_articles_per_feed = 100
language = 'en' language = 'en'
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'utf8' encoding = 'utf8'
publication_type = 'newspaper' publication_type = 'newspaper'
masthead_url = 'http://media.staradvertiser.com/designimages/star-advertiser-logo-small.gif' auto_cleanup = True
# extra_css = """
# body{font-family: Verdana,Arial,Helvetica,sans-serif}
# h1,.brown,.hsa_postCredit{color: #663300}
# .storyDeck{font-size: 1.2em; font-weight: bold}
# img{display: block}
# """
conversion_options = {
'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True
}
keep_only_tags = [
dict(attrs={'id': 'hsa_storyTitle'}), dict(attrs={'id': 'hsa_storyTitle article-important'}), dict(attrs={'class': ['hsa_dateStamp', 'hsa_postCredit', 'storyDeck']}), dict(name='span', attrs={'class': ['hsa_dateStamp', 'hsa_postCredit']}), dict(name='span', attrs={'class': ['hsa_dateStamp article-important', 'hsa_postCredit article-important']}), dict(name='div', attrs={'class': 'storytext article-important'}), dict(name='div', attrs={'class': 'storytext'}) # noqa
]
remove_tags = [
# removed 'span' from preceding list to permit keeping of author and
dict(name=['object', 'link', 'script', 'meta',
'base', 'iframe']) # timestamp
, dict(attrs={'class': ['insideStoryImage', 'insideStoryAd']}), dict(attrs={'name': 'fb_share'})
]
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
if self.username is not None and self.password is not None:
br.open('http://www.staradvertiser.com/manage/Login/')
br.select_form(name='loginForm')
br['email'] = self.username
br['password'] = self.password
br.submit()
return br
feeds = [ feeds = [
(u'Breaking News',
(u'Breaking News', u'http://www.staradvertiser.com/news/breaking/index.rss'), u'http://www.staradvertiser.com/category/breaking-news/feed/'),
(u'News', u'http://www.staradvertiser.com/newspremium/index.rss'), (u'Business', u'http://www.staradvertiser.com/business/feed/'),
(u'Business', u'http://www.staradvertiser.com/businesspremium/index.rss'), (u'Sports', u'http://www.staradvertiser.com/sports/feed/'),
(u'Sports', u'http://www.staradvertiser.com/sportspremium/index.rss'), (u'Features',
(u'Features', u'http://www.staradvertiser.com/featurespremium/index.rss') u'http://www.staradvertiser.com/featurespremium/index.rss')
] ]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('a'):
limg = item.find('img')
if item.string is not None:
str = item.string
item.replaceWith(str)
else:
if limg:
item.name = 'div'
item.attrs = []
else:
str = self.tag_to_string(item)
item.replaceWith(str)
for item in soup.findAll('img'):
if not item.has_key('alt'): # noqa
item['alt'] = 'image'
return soup

View File

@ -10,17 +10,16 @@ http://www.techworld.com/
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
class techworld(BasicNewsRecipe): class techworld(BasicNewsRecipe):
__author__ = 'Lorenzo Vigentini' __author__ = 'Lorenzo Vigentini'
description = 'Techworld offers the latest breaking IT industry news, product reviews, enterprise software downloads, how-to articles and expert blogs for technical professionals and enterprise users in the UK' # noqa description = 'Techworld offers the latest breaking IT industry news, product reviews, enterprise software downloads, how-to articles and expert blogs for technical professionals and enterprise users in the UK' # noqa
cover_url = 'http://www.techworld.com/graphics/header/site_logo.jpg'
title = 'TechWorld' title = 'TechWorld'
publisher = 'IDG Communication' publisher = 'IDG Communication'
category = 'Apple, Mac, video, computing, product reviews, editing, cameras, production' category = ('Apple, Mac, video, computing, product reviews, '
'editing, cameras, production')
language = 'en' language = 'en'
timefmt = '[%a, %d %b, %Y]' timefmt = '[%a, %d %b, %Y]'
@ -32,60 +31,16 @@ class techworld(BasicNewsRecipe):
remove_javascript = True remove_javascript = True
no_stylesheets = True no_stylesheets = True
auto_cleanup = True
temp_files = []
articles_are_obfuscated = True
def get_obfuscated_article(self, url):
br = self.get_browser()
br.open(url)
response = br.follow_link(url_regex='?getDynamicPage&print$', nr=0)
html = response.read()
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write(html)
self.temp_files[-1].close()
return self.temp_files[-1].name
keep_only_tags = [
dict(name='div', attrs={'id': 'articleBody'}),
dict(name='h2', attrs={'class': 'blogTitle'}),
dict(name='h3', attrs={'class': 'blogger'}),
]
remove_tags = [
dict(name='div', attrs={'class': ['submissionBar', 'mpuContainer']}),
dict(name='div', attrs={'id': [
'breadcrumb', 'mainContentSidebar', 'articleIconsList', 'loginSubscribeBoxout']}),
dict(name='ul', attrs={'class': 'articleIconsList'})
]
remove_tags_after = [
dict(name='div', attrs={'id': 'articleFooter'})
]
feeds = [ feeds = [
(u'News', u'http://www.techworld.com/rss/feeds/techworld-news.xml'), (u'News', u'http://www.techworld.com/news/rss'),
(u'How-Tos', u'http://www.techworld.com/rss/feeds/techworld-how-tos.xml'), (u'Tutorial', u'http://www.techworld.com/tutorial/rss'),
(u'Reviews', u'http://www.techworld.com/rss/feeds/techworld-reviews.xml'), (u'Reviews', u'http://www.techworld.com/review/rss'),
(u'Features', u'http://www.techworld.com/rss/feeds/techworld-features.xml'), (u'Features', u'http://www.techworld.com/features/rss'),
(u'Storage', u'http://www.techworld.com/rss/feeds/techworld-storage.xml'), (u'Analysis', u'http://www.techworld.com/analysis/rss'),
(u'Applications', (u'Galleries',
u'http://www.techworld.com/rss/feeds/techworld-applications.xml'), u'http://www.techworld.com/picture-gallery/rss'),
(u'Virtualization', (u'TechWorld Blogs',
u'http://www.techworld.com/rss/feeds/techworld-virtualisation.xml'), u'http://www.techworld.com/blog/rss'),
(u'Personal Tech',
u'http://www.techworld.com/rss/feeds/techworld-personal-tech.xml'),
(u'Green IT', u'http://www.techworld.com/rss/feeds/techworld-green-it.xml'),
(u'Security', u'http://www.techworld.com/rss/feeds/techworld-security.xml'),
(u'Operating Systems',
u'http://www.techworld.com/rss/feeds/techworld-operating-systems.xml'),
(u'Networking', u'http://www.techworld.com/rss/feeds/techworld-networking.xml'),
(u'Mobile and Wireless',
u'http://www.techworld.com/rss/feeds/techworld-mobile-wireless.xml'),
(u'Data Centre', u'http://www.techworld.com/rss/feeds/techworld-data-centre.xml'),
(u'SME', u'http://www.techworld.com/rss/feeds/techworld-sme.xml'),
(u'TechWorld Blogs', u'http://blogs.techworld.com/atom.xml')
] ]
extra_css = '''
img {align:left;}
'''

View File

@ -18,12 +18,14 @@ class TechnologyReview(BasicNewsRecipe):
.subheadline {font: italic large} .subheadline {font: italic large}
""" """
feeds = [ feeds = [
(u'Computing', u'http://feeds.technologyreview.com/technology_review_Computing'), (u'Computing',
(u'Web', u'http://feeds.technologyreview.com/technology_review_Web'), u'http://feeds.technologyreview.com/technology_review_Computing'),
(u'Communications', (u'Energy',
u'http://feeds.technologyreview.com/technology_review_Communications'), u'http://feeds.technologyreview.com/technology_review_Energy'),
(u'Energy', u'http://feeds.technologyreview.com/technology_review_Energy'), (u'Materials',
(u'Materials', u'http://feeds.technologyreview.com/technology_review_Materials'), u'http://feeds.technologyreview.com/technology_review_Materials'),
(u'Biomedicine', u'http://feeds.technologyreview.com/technology_review_Biotech'), (u'Biomedicine',
(u'Business', u'http://feeds.technologyreview.com/technology_review_Biztech') u'http://feeds.technologyreview.com/technology_review_Biotech'),
(u'Business',
u'http://feeds.technologyreview.com/technology_review_Biztech')
] ]

View File

@ -22,20 +22,8 @@ class TheBudgetFashionista(BasicNewsRecipe):
category = 'news, fashion, comsetics, women' category = 'news, fashion, comsetics, women'
lang = 'en-US' lang = 'en-US'
language = 'en' language = 'en'
auto_cleanup = True
conversion_options = { feeds = [(u'Articles',
'comment': description, 'tags': category, 'publisher': publisher, 'language': lang u'http://feeds.feedburner.com/TheBudgetFashionista')
} ]
keep_only_tags = [dict(name='div', attrs={'class': 'columnLeft'})]
remove_tags_after = dict(name='div', attrs={'class': 'postDetails'})
remove_tags = [dict(name=['object', 'link', 'script',
'iframe', 'form', 'login-button'])]
feeds = [(u'Articles', u'http://www.thebudgetfashionista.com/feeds/atom/')]
def preprocess_html(self, soup):
for it in soup.findAll('img'):
if it.parent.name == 'a':
it.parent.name = 'div'
return soup

View File

@ -23,8 +23,5 @@ class TheWeek(BasicNewsRecipe):
language = 'en' language = 'en'
auto_cleanup = True auto_cleanup = True
feeds = [ feeds = [
(u'News-Opinion', u'http://theweek.com/section/index/news_opinion.rss'), (u'Latest articles', u'http://theweek.com/rss.xml'),
(u'Business', u'http://theweek.com/section/index/business.rss'),
(u'Arts-Life', u'http://theweek.com/section/index/arts_life.rss'),
(u'Cartoons', u'http://theweek.com/section/index/cartoon_wit/0/all-cartoons.rss')
] ]

View File

@ -14,7 +14,6 @@ class USAToday(BasicNewsRecipe):
title = 'USA Today' title = 'USA Today'
__author__ = 'Kovid Goyal' __author__ = 'Kovid Goyal'
description = 'newspaper' description = 'newspaper'
cover_url = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg12/lg/USAT.jpg'
encoding = 'utf-8' encoding = 'utf-8'
publisher = 'usatoday.com' publisher = 'usatoday.com'
category = 'news, usa' category = 'news, usa'
@ -28,25 +27,42 @@ class USAToday(BasicNewsRecipe):
filterDuplicates = True filterDuplicates = True
extra_css = ''' extra_css = '''
h1, h2 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;} h1, h2 {
#post-attributes, .info, .clear {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} font-size:xx-large;
#post-body, #content {font-size:medium; font-family:Arial,Helvetica,sans-serif;} font-family:Arial,Helvetica,sans-serif;}
#post-attributes, .info,
.clear {
font-size:xx-small; color:#4D4D4D;
font-family:Arial,Helvetica,sans-serif;
}
#post-body,
#content {
font-size:medium;
font-family:Arial,Helvetica,sans-serif;
}
''' '''
feeds = [ feeds = [
('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'), ('Top Headlines',
('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'), 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
('Personal Tech', 'http://rssfeeds.usatoday.com/UsatodaycomTech-PersonalTalk'), ('Tech Headlines',
('Science', 'http://rssfeeds.usatoday.com/TP-ScienceFair'), 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
('Health', 'http://rssfeeds.usatoday.com/UsatodaycomHealth-TopStories'), ('Personal Tech',
('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'), 'http://rssfeeds.usatoday.com/UsatodaycomTech-PersonalTalk'),
('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'), ('Health',
'http://rssfeeds.usatoday.com/UsatodaycomHealth-TopStories'),
('Travel Headlines',
'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'),
('Money Headlines',
'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'),
('Entertainment Headlines', ('Entertainment Headlines',
'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'), 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'),
('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'), ('Sport Headlines',
('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'), 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'), ('Weather Headlines',
('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories') 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
('Most Popular',
'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
] ]
auto_cleanup = True auto_cleanup = True

View File

@ -20,40 +20,14 @@ class LaPrensa(BasicNewsRecipe):
use_embedded_content = False use_embedded_content = False
encoding = 'utf-8' encoding = 'utf-8'
language = 'en' language = 'en'
auto_cleanup = True
html2lrf_options = [
'--comment', description, '--category', category, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + \
'"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [
dict(name='h1'), dict(name='div', attrs={'id': ['dateline']}), dict(
name='div', attrs={'class': ['blogCredit', 'body']})
]
feeds = [ feeds = [
(u'Homepage', u'http://www.usnews.com/rss/usnews.rss'),
(u'Homepage', u'http://www.usnews.com/rss/usnews.rss'), (u'Health', u'http://www.usnews.com/rss/health'),
(u'Health', u'http://www.usnews.com/rss/health/index.rss'), (u'Nation & World', u'http://www.usnews.com/rss/news'),
(u'Nation & World', u'http://www.usnews.com/rss/news/index.rss'), (u'Money & Business', u'http://www.usnews.com/rss/money'),
(u'Money & Business', u'http://www.usnews.com/rss/business/index.rss'), (u'Education', u'http://www.usnews.com/rss/education'),
(u'Education', u'http://www.usnews.com/rss/education/index.rss'), (u'Opinion', u'http://www.usnews.com/rss/opinion'),
(u'Opinion', u'http://www.usnews.com/rss/opinion/index.rss'), (u'Science', u'http://www.usnews.com/rss/science')
(u'Science', u'http://www.usnews.com/rss/science/index.rss')
] ]
def print_version(self, url):
return url.replace('.html', '_print.html')
def get_article_url(self, article):
raw = article.get('link', None)
artcl, sep, unneeded = raw.rpartition('?')
return artcl
def preprocess_html(self, soup):
del soup.body['onload']
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -11,10 +11,10 @@ class AdvancedUserRecipe1278773519(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
feeds = [ feeds = [
(u'News', u'http://www.wacotrib.com/news/index.rss2'), (u'News', u'http://www.wacotrib.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=news/ap_nation,news/ap_nation/*&f=rss'),
(u'Sports', u'http://www.wacotrib.com/sports/index.rss2'), (u'Sports', u'http://www.wacotrib.com/search/?q=&t=article&l=25&d=&d1=&d2=&s=start_time&sd=desc&c[]=sports*&f=rss'),
(u'AccessWaco', u'http://www.wacotrib.com/accesswaco/index.rss2'), (u'AccessWaco', u'http://www.wacotrib.com/search/?q=&t=article&l=25&d=&d1=&d2=&s=start_time&sd=desc&c[]=entertainment/accesswaco*&f=rss'),
(u'Opinions', u'http://www.wacotrib.com/opinion/index.rss2') (u'Opinions', u'http://www.wacotrib.com/search/?q=&t=article&l=25&d=&d1=&d2=&s=start_time&sd=desc&c[]=opinion*&f=rss')
] ]
remove_javascript = True remove_javascript = True
@ -23,13 +23,4 @@ class AdvancedUserRecipe1278773519(BasicNewsRecipe):
language = 'en' language = 'en'
encoding = 'utf-8' encoding = 'utf-8'
conversion_options = {'linearize_tables': True} conversion_options = {'linearize_tables': True}
masthead_url = 'http://media.wacotrib.com/designimages/wacotrib_logo.jpg' auto_cleanup = True
keep_only_tags = [
dict(name='div', attrs={'class': 'twoColumn left'}),
]
remove_tags = [
dict(name='div', attrs={'class': 'right blueLinks'}),
]
remove_tags_after = [
dict(name='div', attrs={'class': 'dottedRule'}),
]

View File

@ -4,7 +4,6 @@ __copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
www.washingtonpost.com www.washingtonpost.com
''' '''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -23,55 +22,18 @@ class TheWashingtonPost(BasicNewsRecipe):
language = 'en' language = 'en'
remove_empty_feeds = True remove_empty_feeds = True
publication_type = 'newspaper' publication_type = 'newspaper'
masthead_url = 'http://www.washingtonpost.com/rw/sites/twpweb/img/logos/twp_logo_300.gif' auto_cleanup = True
cover_url = strftime(
'http://www.washingtonpost.com/rw/WashingtonPost/Content/Epaper/%Y-%m-%d/Ax1.pdf')
extra_css = """
body{font-family: Georgia,serif }
"""
conversion_options = {
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
keep_only_tags = [
dict(attrs={'id': ['content', 'entryhead', 'entrytext']})]
remove_tags = [
dict(name=['meta', 'link', 'iframe', 'base']), dict(
attrs={'id': 'multimedia-leaf-page'})
]
remove_attributes = ['lang', 'property', 'epochtime',
'datetitle', 'pagetype', 'contenttype', 'comparetime']
feeds = [ feeds = [
(u'World', u'http://feeds.washingtonpost.com/rss/world'),
(u'World', u'http://feeds.washingtonpost.com/rss/world'), (u'National', u'http://feeds.washingtonpost.com/rss/national'),
(u'National', u'http://feeds.washingtonpost.com/rss/national'), (u'White House',
(u'White House', u'http://feeds.washingtonpost.com/rss/politics/whitehouse'), u'http://feeds.washingtonpost.com/rss/politics/whitehouse'),
(u'Business', u'http://feeds.washingtonpost.com/rss/business'), (u'Business', u'http://feeds.washingtonpost.com/rss/business'),
(u'Opinions', u'http://feeds.washingtonpost.com/rss/opinions'), (u'Opinions', u'http://feeds.washingtonpost.com/rss/opinions'),
(u'Investigations', u'http://feeds.washingtonpost.com/rss/investigations'), (u'Local', u'http://feeds.washingtonpost.com/rss/local'),
(u'Local', u'http://feeds.washingtonpost.com/rss/local'), (u'Entertainment',
(u'Entertainment', u'http://feeds.washingtonpost.com/rss/entertainment'), u'http://feeds.washingtonpost.com/rss/entertainment'),
(u'Sports', u'http://feeds.washingtonpost.com/rss/sports'), (u'Sports', u'http://feeds.washingtonpost.com/rss/sports'),
(u'Redskins', u'http://feeds.washingtonpost.com/rss/sports/redskins'), (u'Redskins', u'http://feeds.washingtonpost.com/rss/sports/redskins'),
(u'Special Reports', u'http://feeds.washingtonpost.com/rss/national/special-reports')
] ]
def print_version(self, url):
if '_story.html' in url:
return url.replace('_story.html', '_print.html')
return url
def get_article_url(self, article):
link = BasicNewsRecipe.get_article_url(self, article)
if article.id.startswith('http'):
link = article.id
if 'washingtonpost.com' not in link:
self.log('Skipping ads:', link)
return None
for it in ['_video.html', '_gallery.html', '_links.html']:
if it in link:
self.log('Skipping non-article:', link)
return None
return link

View File

@ -14,5 +14,5 @@ class Worldcrunch(BasicNewsRecipe):
feeds = [ feeds = [
('News', ('News',
'http://www.worldcrunch.com/feed'), 'http://www.worldcrunch.com/rss/rss.php'),
] ]