mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Merge branch 'master' of https://github.com/CoderAllan/calibre
This commit is contained in:
commit
3def2109c0
@ -19,54 +19,14 @@ class Moscowtimes(BasicNewsRecipe):
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
remove_empty_feeds = True
|
||||
encoding = 'cp1251'
|
||||
masthead_url = 'http://www.themoscowtimes.com/bitrix/templates/tmt/img/logo.gif'
|
||||
publication_type = 'newspaper'
|
||||
auto_cleanup = True
|
||||
|
||||
conversion_options = {
|
||||
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
|
||||
}
|
||||
|
||||
extra_css = '''
|
||||
h1{ color:#0066B3; font-family: Georgia,serif ; font-size: large}
|
||||
.article_date{ font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; color:#000000; font-size: x-small;}
|
||||
.autors{color:#999999 ; font-weight: bold ; font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size: x-small; }
|
||||
.photoautors{ color:#999999 ; font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size: x-small; }
|
||||
.text{font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size:75%; }
|
||||
'''
|
||||
feeds = [
|
||||
|
||||
(u'Top Stories', u'http://www.themoscowtimes.com/rss/top'),
|
||||
(u'Current Issue', u'http://www.themoscowtimes.com/rss/issue'),
|
||||
(u'News', u'http://www.themoscowtimes.com/rss/news'),
|
||||
(u'Business', u'http://www.themoscowtimes.com/rss/business'),
|
||||
(u'Art and Ideas', u'http://www.themoscowtimes.com/rss/art'),
|
||||
(u'Opinion', u'http://www.themoscowtimes.com/rss/opinion')
|
||||
(u'Top Stories', u'https://themoscowtimes.com/feeds/main.xml'),
|
||||
(u'Moscow', u'https://themoscowtimes.com/feeds/moscow.xml'),
|
||||
(u'Russia', u'https://themoscowtimes.com/feeds/russia.xml'),
|
||||
(u'World', u'https://themoscowtimes.com/feeds/world.xml'),
|
||||
(u'Business', u'https://themoscowtimes.com/feeds/business.xml'),
|
||||
(u'Opinion', u'https://themoscowtimes.com/feeds/opinion.xml')
|
||||
]
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id': 'content'})]
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class': ['photo_nav', 'phototext']}), dict(
|
||||
name=['iframe', 'meta', 'base', 'link', 'embed', 'object'])
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for lnk in soup.findAll('a'):
|
||||
if lnk.string is not None:
|
||||
ind = self.tag_to_string(lnk)
|
||||
lnk.replaceWith(ind)
|
||||
return soup
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace('.themoscowtimes.com/', '.themoscowtimes.com/print/')
|
||||
|
||||
def get_cover_url(self):
|
||||
cover_url = None
|
||||
href = 'http://www.themoscowtimes.com/pdf/'
|
||||
soup = self.index_to_soup(href)
|
||||
div = soup.find('div', attrs={'class': 'left'})
|
||||
if div:
|
||||
a = div.find('a')
|
||||
if a:
|
||||
cover_url = 'http://www.themoscowtimes.com' + a.img['src']
|
||||
return cover_url
|
||||
|
@ -1,4 +1,3 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
@ -11,7 +10,8 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class Newstraitstimes(BasicNewsRecipe):
|
||||
title = 'New Straits Times from Malaysia'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'Learning Curve, Sunday People, New Straits Times from Malaysia'
|
||||
description = ('Learning Curve, Sunday People, '
|
||||
'New Straits Times from Malaysia')
|
||||
publisher = 'nst.com.my'
|
||||
category = 'news, politics, Malaysia'
|
||||
oldest_article = 2
|
||||
@ -20,13 +20,6 @@ class Newstraitstimes(BasicNewsRecipe):
|
||||
encoding = 'cp1252'
|
||||
use_embedded_content = False
|
||||
language = 'en'
|
||||
masthead_url = 'http://www.nst.com.my/Current_News/NST/Images/new-nstonline.jpg'
|
||||
auto_cleanup = True
|
||||
|
||||
conversion_options = {
|
||||
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
|
||||
}
|
||||
|
||||
remove_tags = [dict(name=['link', 'table'])]
|
||||
keep_only_tags = dict(name='div', attrs={'id': 'haidah'})
|
||||
|
||||
feeds = [(u'Articles', u'http://www.nst.com.my/rss/allSec')]
|
||||
feeds = [(u'Articles', u'http://www.nst.com.my/latest.xml')]
|
||||
|
@ -19,14 +19,6 @@ class OldNewThing(BasicNewsRecipe):
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
publication_type = 'blog'
|
||||
extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif} .code{font-family: "Lucida Console",monospace} '
|
||||
auto_cleanup = True
|
||||
|
||||
conversion_options = {
|
||||
'comment': description, 'tags': 'blog, windows, microsoft, programming', 'publisher': 'Raymond Chen', 'language': language
|
||||
}
|
||||
|
||||
remove_attributes = ['width', 'height']
|
||||
keep_only_tags = [dict(attrs={'class': 'full-post'})]
|
||||
remove_tags = [
|
||||
dict(attrs={'class': ['post-attributes', 'post-tags', 'post-actions']})]
|
||||
feeds = [(u'Posts', u'http://blogs.msdn.com/oldnewthing/rss.xml')]
|
||||
feeds = [(u'Posts', u'https://blogs.msdn.microsoft.com/oldnewthing/feed')]
|
||||
|
@ -32,56 +32,16 @@ class pcAdvisor(BasicNewsRecipe):
|
||||
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'id': 'articlecontent'})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'id': ['crosssitesignup', 'submitarticle', 'dontPrint',
|
||||
'commentsForm', 'userReviewFormContainer', 'reevooContainerId']}),
|
||||
dict(name='div', attrs={'class': 'mpu'}),
|
||||
dict(name='p', attrs={'id': 'articlePageList'}),
|
||||
dict(name='div', attrs={
|
||||
'style': ['margin: 0pt 10px 5px;', 'margin: 0pt 10px 5px;']}),
|
||||
dict(name='p', attrs={'class': 'dontPrint'}),
|
||||
dict(name='h2', attrs={'class': 'sectionTitle'}),
|
||||
dict(name='a', attrs={'title': 'Subscribe to PC Advisor'}),
|
||||
dict(name='a', attrs={'name': 'revooContent'}),
|
||||
{'name': ['form', 'script', 'link']}
|
||||
]
|
||||
|
||||
remove_tags_after = [
|
||||
dict(name='p', attrs={'id': 'crosssitesignup'})
|
||||
]
|
||||
|
||||
def get_article_url(self, article):
|
||||
return article.get('guid', None)
|
||||
auto_cleanup = True
|
||||
|
||||
feeds = [
|
||||
(u'News Headlines', u'http://www.pcadvisor.co.uk/rss/feeds/pcanews.xml'),
|
||||
(u'Reviews', u'http://www.pcadvisor.co.uk/rss/feeds/pcareviews.xml'),
|
||||
(u'New Products',
|
||||
u'http://www.pcadvisor.co.uk/rss/feeds/blog18.xml'),
|
||||
(u'PC Advisor Blog',
|
||||
u'http://www.pcadvisor.co.uk/rss/feeds/blog4.xml'),
|
||||
(u'PC Security',
|
||||
u'http://www.pcadvisor.co.uk/rss/feeds/pca-security.xml'),
|
||||
(u'Laptops', u'http://www.pcadvisor.co.uk/rss/feeds/pca-laptop.xml'),
|
||||
(u'Green Computing',
|
||||
u'http://www.pcadvisor.co.uk/rss/feeds/pca-green-computing.xml'),
|
||||
(u'Internet and broadband',
|
||||
u'http://www.pcadvisor.co.uk/rss/feeds/pca-internet.xml'),
|
||||
(u'Prones and PDAs',
|
||||
u'http://www.pcadvisor.co.uk/rss/feeds/pca-phones.xml'),
|
||||
(u'Software', u'http://www.pcadvisor.co.uk/rss/feeds/pca-software.xml'),
|
||||
(u'Small Business',
|
||||
u'http://www.pcadvisor.co.uk/rss/feeds/pca-small-business.xml'),
|
||||
(u'Photo and video',
|
||||
u'http://www.pcadvisor.co.uk/rss/feeds/pca-photo-video.xml'),
|
||||
(u'Mac News', u'http://www.pcadvisor.co.uk/rss/feeds/pca-mac.xml'),
|
||||
(u'Linux', u'http://www.pcadvisor.co.uk/rss/feeds/pca-linux.xml'),
|
||||
(u'WiFi and Networking',
|
||||
u'http://www.pcadvisor.co.uk/rss/feeds/pca-networking.xml'),
|
||||
(u'Gadgets', u'http://www.pcadvisor.co.uk/rss/feeds/pca-gadgets.xml')
|
||||
(u'Latest', u'http://www.pcadvisor.co.uk/latest/rss'),
|
||||
(u'News', u'http://www.pcadvisor.co.uk/news/rss'),
|
||||
(u'How-tos', u'http://www.pcadvisor.co.uk/how-to/rss'),
|
||||
(u'Reviews', u'http://www.pcadvisor.co.uk/review/rss'),
|
||||
(u'Video Content', u'http://www.pcadvisor.co.uk/video/rss'),
|
||||
(u'iPhone', u'http://www.pcadvisor.co.uk/latest/iphone/rss'),
|
||||
(u'iPad', u'http://www.pcadvisor.co.uk/latest/ipad/rss'),
|
||||
(u'Mac', u'http://www.pcadvisor.co.uk/latest/mac/rss'),
|
||||
(u'Apple', u'http://www.pcadvisor.co.uk/latest/apple/rss'),
|
||||
]
|
||||
|
@ -19,8 +19,6 @@ class HindustanTimes(BasicNewsRecipe):
|
||||
'http://phys.org/rss-feed/physics-news/'),
|
||||
('Space and Earth',
|
||||
'http://phys.org/rss-feed/space-news/'),
|
||||
('Electronics',
|
||||
'http://phys.org/rss-feed/electronics-news/'),
|
||||
('Chemistry',
|
||||
'http://phys.org/rss-feed/chemistry-news/'),
|
||||
('Biology',
|
||||
|
@ -22,14 +22,17 @@ class Politiken_dk(BasicNewsRecipe):
|
||||
encoding = 'cp1252'
|
||||
language = 'da'
|
||||
|
||||
extra_css = ' body{font-family: Arial,Helvetica,sans-serif } h1{font-family: Georgia,"Times New Roman",Times,serif } '
|
||||
extra_css = (' body{font-family: Arial,Helvetica,sans-serif } '
|
||||
'h1{font-family: Georgia,"Times New Roman",Times,serif } ')
|
||||
|
||||
conversion_options = {
|
||||
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
|
||||
'comment': description,
|
||||
'tags': category,
|
||||
'publisher': publisher,
|
||||
'language': language
|
||||
}
|
||||
|
||||
feeds = [
|
||||
|
||||
(u'Tophistorier', u'http://politiken.dk/rss/tophistorier.rss'),
|
||||
(u'Seneste nyt', u'http://politiken.dk/rss/senestenyt.rss'),
|
||||
(u'Mest laeste', u'http://politiken.dk/rss/mestlaeste.rss'),
|
||||
@ -41,7 +44,6 @@ class Politiken_dk(BasicNewsRecipe):
|
||||
(u'Kultur', u'http://politiken.dk/rss/kultur.rss'),
|
||||
(u'Sport', u'http://politiken.dk/rss/sport.rss'),
|
||||
(u'Uddannelse', u'http://politiken.dk/rss/uddannelse.rss'),
|
||||
(u'Videnskab', u'http://politiken.dk/rss/videnskab.rss')
|
||||
]
|
||||
remove_tags_before = dict(name='h1')
|
||||
remove_tags = [
|
||||
|
@ -4,7 +4,6 @@ __copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
rollingstone.com
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
|
||||
@ -22,44 +21,8 @@ class RollingStone(BasicNewsRecipe):
|
||||
language = 'en'
|
||||
remove_empty_feeds = True
|
||||
publication_type = 'magazine'
|
||||
masthead_url = 'http://www.rollingstone.com/templates/rolling-stone-templates/theme/rstheme/images/rsLogo.png'
|
||||
extra_css = """
|
||||
body{font-family: Georgia,Times,serif }
|
||||
img{margin-bottom: 0.4em; display:block}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
|
||||
}
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'xml:lang="en">.*?<head>', re.DOTALL | re.IGNORECASE), lambda match: 'xml:lang="en">\n<head>\n'), (re.compile(
|
||||
r'</title>.*?</head>', re.DOTALL | re.IGNORECASE), lambda match: '</title>\n</head>\n')
|
||||
]
|
||||
|
||||
keep_only_tags = [
|
||||
dict(attrs={'class': ['headerImgHolder', 'headerContent']}), dict(name='div', attrs={'id': [
|
||||
'teaser', 'storyTextContainer']}), dict(name='div', attrs={'class': 'blogDetailModule clearfix'})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name=['meta', 'iframe', 'object', 'embed']), dict(
|
||||
attrs={'id': 'mpStoryHeader'}), dict(attrs={'class': 'relatedTopics'})
|
||||
]
|
||||
remove_attributes = ['lang', 'onclick', 'width', 'height', 'name']
|
||||
remove_tags_before = dict(attrs={'class': 'bloggerInfo'})
|
||||
remove_tags_after = dict(attrs={'class': 'relatedTopics'})
|
||||
auto_cleanup = True
|
||||
|
||||
feeds = [
|
||||
|
||||
(u'All News', u'http://www.rollingstone.com/siteServices/rss/allNews'),
|
||||
(u'All Blogs', u'http://www.rollingstone.com/siteServices/rss/allBlogs'),
|
||||
(u'Movie Reviews', u'http://www.rollingstone.com/siteServices/rss/movieReviews'),
|
||||
(u'Album Reviews', u'http://www.rollingstone.com/siteServices/rss/albumReviews'),
|
||||
(u'Song Reviews', u'http://www.rollingstone.com/siteServices/rss/songReviews')
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
return soup
|
||||
|
@ -1,45 +0,0 @@
|
||||
#!/usr/bin/env python2
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Tony Stegall'
|
||||
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobileread.com'
|
||||
__version__ = 'v1.01'
|
||||
__date__ = '07, October 2010'
|
||||
__description__ = 'Rolling Stones Mag'
|
||||
|
||||
'''
|
||||
http://www.rollingstone.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
|
||||
class RollingStones(BasicNewsRecipe):
|
||||
__author__ = 'Tony Stegall'
|
||||
description = 'Rolling Stones Mag'
|
||||
cover_url = 'http://gallery.celebritypro.com/data/media/648/kid-rock-rolling-stone-cover.jpg'
|
||||
masthead_url = 'http://origin.myfonts.com/s/ec/cc-200804/Rolling_Stone-logo.gif'
|
||||
|
||||
title = 'Rolling Stones Mag'
|
||||
category = 'Music Reviews, Movie Reviews, entertainment news'
|
||||
|
||||
language = 'en'
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
|
||||
oldest_article = 15
|
||||
max_articles_per_feed = 25
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
auto_cleanup = True
|
||||
|
||||
feeds = [
|
||||
(u'News', u'http://www.rollingstone.com/siteServices/rss/allNews'),
|
||||
(u'Blogs', u'http://www.rollingstone.com/siteServices/rss/allBlogs'),
|
||||
(u'Movie Reviews', u'http://www.rollingstone.com/siteServices/rss/movieReviews'),
|
||||
(u'Album Reviews', u'http://www.rollingstone.com/siteServices/rss/albumReviews'),
|
||||
(u'Song Reviews', u'http://www.rollingstone.com/siteServices/rss/songReviews'),
|
||||
|
||||
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
return url + '?print=true'
|
@ -14,6 +14,5 @@ class SanFranciscoBayGuardian(BasicNewsRecipe):
|
||||
]
|
||||
|
||||
feeds = [
|
||||
('sfbg', 'http://www.sfbg.com/rss.xml'),
|
||||
('sfbg', 'http://www.sfbg.com/feed/'),
|
||||
]
|
||||
|
||||
|
@ -9,23 +9,9 @@ class Shacknews(BasicNewsRecipe):
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
language = 'en'
|
||||
|
||||
no_stylesheets = True
|
||||
remove_tags = [dict(name='div', attrs={'class': ['nuggets', 'comments']}),
|
||||
dict(name='p', attrs={'class': 'videoembed'})]
|
||||
keep_only_tags = [dict(name='div', attrs={'class': 'story'})]
|
||||
auto_cleanup = True
|
||||
|
||||
feeds = [
|
||||
(u'Latest News', u'http://feed.shacknews.com/shackfeed.xml'),
|
||||
(u'PC', u'http://feed.shacknews.com/extras/tag_rss.x/PC'),
|
||||
(u'Wii', u'http://feed.shacknews.com/extras/tag_rss.x/Nintendo+Wii'),
|
||||
(u'Xbox 360', u'http://feed.shacknews.com/extras/tag_rss.x/Xbox+360'),
|
||||
(u'Playstation 3',
|
||||
u'http://feed.shacknews.com/extras/tag_rss.x/PlayStation+3'),
|
||||
(u'PSP', u'http://feed.shacknews.com/extras/tag_rss.x/PSP'),
|
||||
(u'Nintendo DS', u'http://feed.shacknews.com/extras/tag_rss.x/Nintendo+DS'),
|
||||
(u'iPhone', u'http://feed.shacknews.com/extras/tag_rss.x/iPhone'),
|
||||
(u'DLC', u'http://feed.shacknews.com/extras/tag_rss.x/DLC'),
|
||||
(u'Valve', u'http://feed.shacknews.com/extras/tag_rss.x/Valve'),
|
||||
(u'Electronic Arts',
|
||||
u'http://feed.shacknews.com/extras/tag_rss.x/Electronic+Arts')
|
||||
(u'Latest News', u'http://www.shacknews.com/shackfeed.xml'),
|
||||
]
|
||||
|
@ -14,69 +14,19 @@ class Starbulletin(BasicNewsRecipe):
|
||||
publisher = 'Honolulu Star-Advertiser'
|
||||
category = 'news, Honolulu, Hawaii'
|
||||
oldest_article = 2
|
||||
needs_subscription = True
|
||||
max_articles_per_feed = 100
|
||||
language = 'en'
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
encoding = 'utf8'
|
||||
publication_type = 'newspaper'
|
||||
masthead_url = 'http://media.staradvertiser.com/designimages/star-advertiser-logo-small.gif'
|
||||
# extra_css = """
|
||||
# body{font-family: Verdana,Arial,Helvetica,sans-serif}
|
||||
# h1,.brown,.hsa_postCredit{color: #663300}
|
||||
# .storyDeck{font-size: 1.2em; font-weight: bold}
|
||||
# img{display: block}
|
||||
# """
|
||||
|
||||
conversion_options = {
|
||||
'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True
|
||||
}
|
||||
keep_only_tags = [
|
||||
dict(attrs={'id': 'hsa_storyTitle'}), dict(attrs={'id': 'hsa_storyTitle article-important'}), dict(attrs={'class': ['hsa_dateStamp', 'hsa_postCredit', 'storyDeck']}), dict(name='span', attrs={'class': ['hsa_dateStamp', 'hsa_postCredit']}), dict(name='span', attrs={'class': ['hsa_dateStamp article-important', 'hsa_postCredit article-important']}), dict(name='div', attrs={'class': 'storytext article-important'}), dict(name='div', attrs={'class': 'storytext'}) # noqa
|
||||
]
|
||||
remove_tags = [
|
||||
# removed 'span' from preceding list to permit keeping of author and
|
||||
dict(name=['object', 'link', 'script', 'meta',
|
||||
'base', 'iframe']) # timestamp
|
||||
, dict(attrs={'class': ['insideStoryImage', 'insideStoryAd']}), dict(attrs={'name': 'fb_share'})
|
||||
]
|
||||
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser(self)
|
||||
if self.username is not None and self.password is not None:
|
||||
br.open('http://www.staradvertiser.com/manage/Login/')
|
||||
br.select_form(name='loginForm')
|
||||
br['email'] = self.username
|
||||
br['password'] = self.password
|
||||
br.submit()
|
||||
return br
|
||||
auto_cleanup = True
|
||||
|
||||
feeds = [
|
||||
|
||||
(u'Breaking News', u'http://www.staradvertiser.com/news/breaking/index.rss'),
|
||||
(u'News', u'http://www.staradvertiser.com/newspremium/index.rss'),
|
||||
(u'Business', u'http://www.staradvertiser.com/businesspremium/index.rss'),
|
||||
(u'Sports', u'http://www.staradvertiser.com/sportspremium/index.rss'),
|
||||
(u'Features', u'http://www.staradvertiser.com/featurespremium/index.rss')
|
||||
(u'Breaking News',
|
||||
u'http://www.staradvertiser.com/category/breaking-news/feed/'),
|
||||
(u'Business', u'http://www.staradvertiser.com/business/feed/'),
|
||||
(u'Sports', u'http://www.staradvertiser.com/sports/feed/'),
|
||||
(u'Features',
|
||||
u'http://www.staradvertiser.com/featurespremium/index.rss')
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for item in soup.findAll('a'):
|
||||
limg = item.find('img')
|
||||
if item.string is not None:
|
||||
str = item.string
|
||||
item.replaceWith(str)
|
||||
else:
|
||||
if limg:
|
||||
item.name = 'div'
|
||||
item.attrs = []
|
||||
else:
|
||||
str = self.tag_to_string(item)
|
||||
item.replaceWith(str)
|
||||
for item in soup.findAll('img'):
|
||||
if not item.has_key('alt'): # noqa
|
||||
item['alt'] = 'image'
|
||||
return soup
|
||||
|
@ -10,17 +10,16 @@ http://www.techworld.com/
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
|
||||
|
||||
class techworld(BasicNewsRecipe):
|
||||
__author__ = 'Lorenzo Vigentini'
|
||||
description = 'Techworld offers the latest breaking IT industry news, product reviews, enterprise software downloads, how-to articles and expert blogs for technical professionals and enterprise users in the UK' # noqa
|
||||
cover_url = 'http://www.techworld.com/graphics/header/site_logo.jpg'
|
||||
|
||||
title = 'TechWorld'
|
||||
publisher = 'IDG Communication'
|
||||
category = 'Apple, Mac, video, computing, product reviews, editing, cameras, production'
|
||||
category = ('Apple, Mac, video, computing, product reviews, '
|
||||
'editing, cameras, production')
|
||||
|
||||
language = 'en'
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
@ -32,60 +31,16 @@ class techworld(BasicNewsRecipe):
|
||||
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
|
||||
temp_files = []
|
||||
articles_are_obfuscated = True
|
||||
|
||||
def get_obfuscated_article(self, url):
|
||||
br = self.get_browser()
|
||||
br.open(url)
|
||||
response = br.follow_link(url_regex='?getDynamicPage&print$', nr=0)
|
||||
html = response.read()
|
||||
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
|
||||
self.temp_files[-1].write(html)
|
||||
self.temp_files[-1].close()
|
||||
return self.temp_files[-1].name
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'id': 'articleBody'}),
|
||||
dict(name='h2', attrs={'class': 'blogTitle'}),
|
||||
dict(name='h3', attrs={'class': 'blogger'}),
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class': ['submissionBar', 'mpuContainer']}),
|
||||
dict(name='div', attrs={'id': [
|
||||
'breadcrumb', 'mainContentSidebar', 'articleIconsList', 'loginSubscribeBoxout']}),
|
||||
dict(name='ul', attrs={'class': 'articleIconsList'})
|
||||
]
|
||||
remove_tags_after = [
|
||||
dict(name='div', attrs={'id': 'articleFooter'})
|
||||
]
|
||||
auto_cleanup = True
|
||||
|
||||
feeds = [
|
||||
(u'News', u'http://www.techworld.com/rss/feeds/techworld-news.xml'),
|
||||
(u'How-Tos', u'http://www.techworld.com/rss/feeds/techworld-how-tos.xml'),
|
||||
(u'Reviews', u'http://www.techworld.com/rss/feeds/techworld-reviews.xml'),
|
||||
(u'Features', u'http://www.techworld.com/rss/feeds/techworld-features.xml'),
|
||||
(u'Storage', u'http://www.techworld.com/rss/feeds/techworld-storage.xml'),
|
||||
(u'Applications',
|
||||
u'http://www.techworld.com/rss/feeds/techworld-applications.xml'),
|
||||
(u'Virtualization',
|
||||
u'http://www.techworld.com/rss/feeds/techworld-virtualisation.xml'),
|
||||
(u'Personal Tech',
|
||||
u'http://www.techworld.com/rss/feeds/techworld-personal-tech.xml'),
|
||||
(u'Green IT', u'http://www.techworld.com/rss/feeds/techworld-green-it.xml'),
|
||||
(u'Security', u'http://www.techworld.com/rss/feeds/techworld-security.xml'),
|
||||
(u'Operating Systems',
|
||||
u'http://www.techworld.com/rss/feeds/techworld-operating-systems.xml'),
|
||||
(u'Networking', u'http://www.techworld.com/rss/feeds/techworld-networking.xml'),
|
||||
(u'Mobile and Wireless',
|
||||
u'http://www.techworld.com/rss/feeds/techworld-mobile-wireless.xml'),
|
||||
(u'Data Centre', u'http://www.techworld.com/rss/feeds/techworld-data-centre.xml'),
|
||||
(u'SME', u'http://www.techworld.com/rss/feeds/techworld-sme.xml'),
|
||||
(u'TechWorld Blogs', u'http://blogs.techworld.com/atom.xml')
|
||||
(u'News', u'http://www.techworld.com/news/rss'),
|
||||
(u'Tutorial', u'http://www.techworld.com/tutorial/rss'),
|
||||
(u'Reviews', u'http://www.techworld.com/review/rss'),
|
||||
(u'Features', u'http://www.techworld.com/features/rss'),
|
||||
(u'Analysis', u'http://www.techworld.com/analysis/rss'),
|
||||
(u'Galleries',
|
||||
u'http://www.techworld.com/picture-gallery/rss'),
|
||||
(u'TechWorld Blogs',
|
||||
u'http://www.techworld.com/blog/rss'),
|
||||
]
|
||||
|
||||
extra_css = '''
|
||||
img {align:left;}
|
||||
'''
|
||||
|
@ -18,12 +18,14 @@ class TechnologyReview(BasicNewsRecipe):
|
||||
.subheadline {font: italic large}
|
||||
"""
|
||||
feeds = [
|
||||
(u'Computing', u'http://feeds.technologyreview.com/technology_review_Computing'),
|
||||
(u'Web', u'http://feeds.technologyreview.com/technology_review_Web'),
|
||||
(u'Communications',
|
||||
u'http://feeds.technologyreview.com/technology_review_Communications'),
|
||||
(u'Energy', u'http://feeds.technologyreview.com/technology_review_Energy'),
|
||||
(u'Materials', u'http://feeds.technologyreview.com/technology_review_Materials'),
|
||||
(u'Biomedicine', u'http://feeds.technologyreview.com/technology_review_Biotech'),
|
||||
(u'Business', u'http://feeds.technologyreview.com/technology_review_Biztech')
|
||||
(u'Computing',
|
||||
u'http://feeds.technologyreview.com/technology_review_Computing'),
|
||||
(u'Energy',
|
||||
u'http://feeds.technologyreview.com/technology_review_Energy'),
|
||||
(u'Materials',
|
||||
u'http://feeds.technologyreview.com/technology_review_Materials'),
|
||||
(u'Biomedicine',
|
||||
u'http://feeds.technologyreview.com/technology_review_Biotech'),
|
||||
(u'Business',
|
||||
u'http://feeds.technologyreview.com/technology_review_Biztech')
|
||||
]
|
||||
|
@ -22,20 +22,8 @@ class TheBudgetFashionista(BasicNewsRecipe):
|
||||
category = 'news, fashion, comsetics, women'
|
||||
lang = 'en-US'
|
||||
language = 'en'
|
||||
auto_cleanup = True
|
||||
|
||||
conversion_options = {
|
||||
'comment': description, 'tags': category, 'publisher': publisher, 'language': lang
|
||||
}
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class': 'columnLeft'})]
|
||||
remove_tags_after = dict(name='div', attrs={'class': 'postDetails'})
|
||||
remove_tags = [dict(name=['object', 'link', 'script',
|
||||
'iframe', 'form', 'login-button'])]
|
||||
|
||||
feeds = [(u'Articles', u'http://www.thebudgetfashionista.com/feeds/atom/')]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for it in soup.findAll('img'):
|
||||
if it.parent.name == 'a':
|
||||
it.parent.name = 'div'
|
||||
return soup
|
||||
feeds = [(u'Articles',
|
||||
u'http://feeds.feedburner.com/TheBudgetFashionista')
|
||||
]
|
||||
|
@ -23,8 +23,5 @@ class TheWeek(BasicNewsRecipe):
|
||||
language = 'en'
|
||||
auto_cleanup = True
|
||||
feeds = [
|
||||
(u'News-Opinion', u'http://theweek.com/section/index/news_opinion.rss'),
|
||||
(u'Business', u'http://theweek.com/section/index/business.rss'),
|
||||
(u'Arts-Life', u'http://theweek.com/section/index/arts_life.rss'),
|
||||
(u'Cartoons', u'http://theweek.com/section/index/cartoon_wit/0/all-cartoons.rss')
|
||||
(u'Latest articles', u'http://theweek.com/rss.xml'),
|
||||
]
|
||||
|
@ -14,7 +14,6 @@ class USAToday(BasicNewsRecipe):
|
||||
title = 'USA Today'
|
||||
__author__ = 'Kovid Goyal'
|
||||
description = 'newspaper'
|
||||
cover_url = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg12/lg/USAT.jpg'
|
||||
encoding = 'utf-8'
|
||||
publisher = 'usatoday.com'
|
||||
category = 'news, usa'
|
||||
@ -28,25 +27,42 @@ class USAToday(BasicNewsRecipe):
|
||||
filterDuplicates = True
|
||||
|
||||
extra_css = '''
|
||||
h1, h2 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
|
||||
#post-attributes, .info, .clear {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
|
||||
#post-body, #content {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
|
||||
h1, h2 {
|
||||
font-size:xx-large;
|
||||
font-family:Arial,Helvetica,sans-serif;}
|
||||
#post-attributes, .info,
|
||||
.clear {
|
||||
font-size:xx-small; color:#4D4D4D;
|
||||
font-family:Arial,Helvetica,sans-serif;
|
||||
}
|
||||
#post-body,
|
||||
#content {
|
||||
font-size:medium;
|
||||
font-family:Arial,Helvetica,sans-serif;
|
||||
}
|
||||
'''
|
||||
|
||||
feeds = [
|
||||
('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
|
||||
('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
|
||||
('Personal Tech', 'http://rssfeeds.usatoday.com/UsatodaycomTech-PersonalTalk'),
|
||||
('Science', 'http://rssfeeds.usatoday.com/TP-ScienceFair'),
|
||||
('Health', 'http://rssfeeds.usatoday.com/UsatodaycomHealth-TopStories'),
|
||||
('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'),
|
||||
('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'),
|
||||
('Top Headlines',
|
||||
'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
|
||||
('Tech Headlines',
|
||||
'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
|
||||
('Personal Tech',
|
||||
'http://rssfeeds.usatoday.com/UsatodaycomTech-PersonalTalk'),
|
||||
('Health',
|
||||
'http://rssfeeds.usatoday.com/UsatodaycomHealth-TopStories'),
|
||||
('Travel Headlines',
|
||||
'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'),
|
||||
('Money Headlines',
|
||||
'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'),
|
||||
('Entertainment Headlines',
|
||||
'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'),
|
||||
('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
|
||||
('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
|
||||
('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
|
||||
('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories')
|
||||
('Sport Headlines',
|
||||
'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
|
||||
('Weather Headlines',
|
||||
'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
|
||||
('Most Popular',
|
||||
'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
|
||||
]
|
||||
|
||||
auto_cleanup = True
|
||||
|
@ -20,40 +20,14 @@ class LaPrensa(BasicNewsRecipe):
|
||||
use_embedded_content = False
|
||||
encoding = 'utf-8'
|
||||
language = 'en'
|
||||
|
||||
html2lrf_options = [
|
||||
'--comment', description, '--category', category, '--publisher', publisher
|
||||
]
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + \
|
||||
'"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='h1'), dict(name='div', attrs={'id': ['dateline']}), dict(
|
||||
name='div', attrs={'class': ['blogCredit', 'body']})
|
||||
]
|
||||
auto_cleanup = True
|
||||
|
||||
feeds = [
|
||||
|
||||
(u'Homepage', u'http://www.usnews.com/rss/usnews.rss'),
|
||||
(u'Health', u'http://www.usnews.com/rss/health/index.rss'),
|
||||
(u'Nation & World', u'http://www.usnews.com/rss/news/index.rss'),
|
||||
(u'Money & Business', u'http://www.usnews.com/rss/business/index.rss'),
|
||||
(u'Education', u'http://www.usnews.com/rss/education/index.rss'),
|
||||
(u'Opinion', u'http://www.usnews.com/rss/opinion/index.rss'),
|
||||
(u'Science', u'http://www.usnews.com/rss/science/index.rss')
|
||||
(u'Health', u'http://www.usnews.com/rss/health'),
|
||||
(u'Nation & World', u'http://www.usnews.com/rss/news'),
|
||||
(u'Money & Business', u'http://www.usnews.com/rss/money'),
|
||||
(u'Education', u'http://www.usnews.com/rss/education'),
|
||||
(u'Opinion', u'http://www.usnews.com/rss/opinion'),
|
||||
(u'Science', u'http://www.usnews.com/rss/science')
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace('.html', '_print.html')
|
||||
|
||||
def get_article_url(self, article):
|
||||
raw = article.get('link', None)
|
||||
artcl, sep, unneeded = raw.rpartition('?')
|
||||
return artcl
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
del soup.body['onload']
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
return soup
|
||||
|
@ -11,10 +11,10 @@ class AdvancedUserRecipe1278773519(BasicNewsRecipe):
|
||||
max_articles_per_feed = 100
|
||||
|
||||
feeds = [
|
||||
(u'News', u'http://www.wacotrib.com/news/index.rss2'),
|
||||
(u'Sports', u'http://www.wacotrib.com/sports/index.rss2'),
|
||||
(u'AccessWaco', u'http://www.wacotrib.com/accesswaco/index.rss2'),
|
||||
(u'Opinions', u'http://www.wacotrib.com/opinion/index.rss2')
|
||||
(u'News', u'http://www.wacotrib.com/search/?q=&t=article&l=100&d=&d1=&d2=&s=start_time&sd=desc&nsa=eedition&c[]=news/ap_nation,news/ap_nation/*&f=rss'),
|
||||
(u'Sports', u'http://www.wacotrib.com/search/?q=&t=article&l=25&d=&d1=&d2=&s=start_time&sd=desc&c[]=sports*&f=rss'),
|
||||
(u'AccessWaco', u'http://www.wacotrib.com/search/?q=&t=article&l=25&d=&d1=&d2=&s=start_time&sd=desc&c[]=entertainment/accesswaco*&f=rss'),
|
||||
(u'Opinions', u'http://www.wacotrib.com/search/?q=&t=article&l=25&d=&d1=&d2=&s=start_time&sd=desc&c[]=opinion*&f=rss')
|
||||
]
|
||||
|
||||
remove_javascript = True
|
||||
@ -23,13 +23,4 @@ class AdvancedUserRecipe1278773519(BasicNewsRecipe):
|
||||
language = 'en'
|
||||
encoding = 'utf-8'
|
||||
conversion_options = {'linearize_tables': True}
|
||||
masthead_url = 'http://media.wacotrib.com/designimages/wacotrib_logo.jpg'
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class': 'twoColumn left'}),
|
||||
]
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class': 'right blueLinks'}),
|
||||
]
|
||||
remove_tags_after = [
|
||||
dict(name='div', attrs={'class': 'dottedRule'}),
|
||||
]
|
||||
auto_cleanup = True
|
||||
|
@ -4,7 +4,6 @@ __copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
www.washingtonpost.com
|
||||
'''
|
||||
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
|
||||
@ -23,55 +22,18 @@ class TheWashingtonPost(BasicNewsRecipe):
|
||||
language = 'en'
|
||||
remove_empty_feeds = True
|
||||
publication_type = 'newspaper'
|
||||
masthead_url = 'http://www.washingtonpost.com/rw/sites/twpweb/img/logos/twp_logo_300.gif'
|
||||
cover_url = strftime(
|
||||
'http://www.washingtonpost.com/rw/WashingtonPost/Content/Epaper/%Y-%m-%d/Ax1.pdf')
|
||||
extra_css = """
|
||||
body{font-family: Georgia,serif }
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
|
||||
}
|
||||
|
||||
keep_only_tags = [
|
||||
dict(attrs={'id': ['content', 'entryhead', 'entrytext']})]
|
||||
remove_tags = [
|
||||
dict(name=['meta', 'link', 'iframe', 'base']), dict(
|
||||
attrs={'id': 'multimedia-leaf-page'})
|
||||
]
|
||||
remove_attributes = ['lang', 'property', 'epochtime',
|
||||
'datetitle', 'pagetype', 'contenttype', 'comparetime']
|
||||
auto_cleanup = True
|
||||
|
||||
feeds = [
|
||||
|
||||
(u'World', u'http://feeds.washingtonpost.com/rss/world'),
|
||||
(u'National', u'http://feeds.washingtonpost.com/rss/national'),
|
||||
(u'White House', u'http://feeds.washingtonpost.com/rss/politics/whitehouse'),
|
||||
(u'White House',
|
||||
u'http://feeds.washingtonpost.com/rss/politics/whitehouse'),
|
||||
(u'Business', u'http://feeds.washingtonpost.com/rss/business'),
|
||||
(u'Opinions', u'http://feeds.washingtonpost.com/rss/opinions'),
|
||||
(u'Investigations', u'http://feeds.washingtonpost.com/rss/investigations'),
|
||||
(u'Local', u'http://feeds.washingtonpost.com/rss/local'),
|
||||
(u'Entertainment', u'http://feeds.washingtonpost.com/rss/entertainment'),
|
||||
(u'Entertainment',
|
||||
u'http://feeds.washingtonpost.com/rss/entertainment'),
|
||||
(u'Sports', u'http://feeds.washingtonpost.com/rss/sports'),
|
||||
(u'Redskins', u'http://feeds.washingtonpost.com/rss/sports/redskins'),
|
||||
(u'Special Reports', u'http://feeds.washingtonpost.com/rss/national/special-reports')
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
if '_story.html' in url:
|
||||
return url.replace('_story.html', '_print.html')
|
||||
return url
|
||||
|
||||
def get_article_url(self, article):
|
||||
link = BasicNewsRecipe.get_article_url(self, article)
|
||||
if article.id.startswith('http'):
|
||||
link = article.id
|
||||
if 'washingtonpost.com' not in link:
|
||||
self.log('Skipping ads:', link)
|
||||
return None
|
||||
for it in ['_video.html', '_gallery.html', '_links.html']:
|
||||
if it in link:
|
||||
self.log('Skipping non-article:', link)
|
||||
return None
|
||||
return link
|
||||
|
@ -14,5 +14,5 @@ class Worldcrunch(BasicNewsRecipe):
|
||||
|
||||
feeds = [
|
||||
('News',
|
||||
'http://www.worldcrunch.com/feed'),
|
||||
'http://www.worldcrunch.com/rss/rss.php'),
|
||||
]
|
||||
|
Loading…
x
Reference in New Issue
Block a user