Kovid Goyal 2016-12-03 11:57:34 +05:30
commit 96947fd01a
10 changed files with 54 additions and 665 deletions
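Most of the recipe changes below follow one pattern: hand-maintained keep_only_tags / remove_tags selectors and print_version helpers are dropped in favour of auto_cleanup, and stale feed URLs are replaced with working ones. As a rough sketch of the shape the simplified recipes converge on (the class name, title and feed URL here are placeholders, not taken from this commit):

from calibre.web.feeds.news import BasicNewsRecipe


class SimplifiedRecipeSketch(BasicNewsRecipe):
    # Placeholder metadata; each recipe in this commit keeps its own title and feeds.
    title = u'Example Site'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    # Let calibre's heuristic cleanup find the article body instead of
    # maintaining keep_only_tags / remove_tags selectors by hand.
    auto_cleanup = True
    feeds = [(u'News', u'http://example.com/feed')]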

View File

@@ -18,35 +18,6 @@ class EdgeConversationRSS(BasicNewsRecipe):
oldest_article = 60
max_articles_per_feed = 100
no_stylesheets = True
auto_cleanup = True
keep_only_tags = [
dict(name='div', attrs={'class': 'HomeLeftPannel IMGCTRL'})]
remove_tags = [
dict(name='div', attrs={'class': 'Logo'})
]
feeds = [(u'Edge RSS', u'http://edge.org/feeds/')]
def print_version(self, url):
return url.replace('conversation/', 'conversation.php?cid=')
def parse_feeds(self):
# Call parent's method.
feeds = BasicNewsRecipe.parse_feeds(self)
# Loop through all feeds.
for feed in feeds:
# Loop through all articles in feed.
for article in feed.articles[:]:
# Remove anything that is not a conversation, and remove PDF
# files as well...
if not ('CONVERSATION' in article.title):
feed.articles.remove(article)
elif 'pdf' in article.url:
feed.articles.remove(article)
return feeds
feeds = [(u'Edge RSS', u'http://edge.org/feed')]

View File

@@ -1,53 +0,0 @@
from calibre.web.feeds.news import BasicNewsRecipe
class EuropeanVoice(BasicNewsRecipe):
title = u'European Voice'
__author__ = 'malfi'
oldest_article = 14
max_articles_per_feed = 100
no_stylesheets = True
cover_url = 'http://www.europeanvoice.com/Css/images/logo.gif'
language = 'en'
keep_only_tags = [dict(name='div', attrs={'id': 'articleLeftColumn'})]
remove_tags = [dict(name='div', attrs={'id': 'BreadCrump'})]
feeds = [
(u'Whole site ', u'http://www.europeanvoice.com/Rss/2.xml'),
(u'News and analysis', u'http://www.europeanvoice.com/Rss/6.xml'),
(u'Comment', u'http://www.europeanvoice.com/Rss/7.xml'),
(u'Special reports', u'http://www.europeanvoice.com/Rss/5.xml'),
(u'People', u'http://www.europeanvoice.com/Rss/8.xml'),
(u'Career', u'http://www.europeanvoice.com/Rss/11.xml'),
(u'Policies', u'http://www.europeanvoice.com/Rss/4.xml'),
(u'EVents', u'http://www.europeanvoice.com/Rss/10.xml'),
(u'Policies - Economics', u'http://www.europeanvoice.com/Rss/31.xml'),
(u'Policies - Business', u'http://www.europeanvoice.com/Rss/19.xml'),
(u'Policies - Trade', u'http://www.europeanvoice.com/Rss/25.xml'),
(u'Policies - Information society',
u'http://www.europeanvoice.com/Rss/20.xml'),
(u'Policies - Energy', u'http://www.europeanvoice.com/Rss/15.xml'),
(u'Policies - Transport', u'http://www.europeanvoice.com/Rss/18.xml'),
(u'Policies - Climate change', u'http://www.europeanvoice.com/Rss/16.xml'),
(u'Policies - Environment', u'http://www.europeanvoice.com/Rss/17.xml'),
(u'Policies - Farming & food', u'http://www.europeanvoice.com/Rss/23.xml'),
(u'Policies - Health & society', u'http://www.europeanvoice.com/Rss/24.xml'),
(u'Policies - Justice', u'http://www.europeanvoice.com/Rss/29.xml'),
(u'Policies - Foreign affairs', u'http://www.europeanvoice.com/Rss/27.xml')
]
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''
def print_version(self, url):
return url + '?bPrint=1'
def preprocess_html(self, soup):
denied = soup.findAll(True, text='Subscribers')
if denied:
raise Exception(
'Article skipped, because content can only be seen with subscription')
return soup

View File

@@ -15,29 +15,17 @@ class Kitsapsun(BasicNewsRecipe):
publisher = 'Scripps Interactive Newspapers Group'
category = 'news, Kitsap county, USA'
language = 'en'
oldest_article = 2
max_articles_per_feed = 100
oldest_article = 7
max_articles_per_feed = 50
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
auto_cleanup = True
conversion_options = {
'comments': description, 'tags': category, 'language': language, 'publisher': publisher
}
keep_only_tags = [
dict(name='div', attrs={'id': ['story_meta', 'story_content']})]
remove_tags = [dict(name=['object', 'link', 'embed', 'form', 'iframe'])]
feeds = [
(u'News', u'http://www.kitsapsun.com/rss/headlines/news/'),
(u'Business', u'http://www.kitsapsun.com/rss/headlines/business/'),
(u'Communities', u'http://www.kitsapsun.com/rss/headlines/communities/'),
(u'Entertainment', u'http://www.kitsapsun.com/rss/headlines/entertainment/'),
(u'Lifestyles', u'http://www.kitsapsun.com/rss/headlines/lifestyles/')
feeds = [(u'News', u'http://www.kitsapsun.com/feeds/rss/news'),
(u'Sports', u'http://www.kitsapsun.com/feeds/rss/sports'),
(u'Entertainment',
u'http://www.kitsapsun.com/feeds/rss/entertainment'),
(u'Lifestyles', u'http://www.kitsapsun.com/feeds/rss/lifestyle'),
(u'Opinion', u'http://www.kitsapsun.com/feeds/rss/opinion'),
]
def print_version(self, url):
return url.rpartition('/')[0] + '/?print=1'

View File

@@ -14,5 +14,5 @@ class HindustanTimes(BasicNewsRecipe):
feeds = [
('News',
'http://www.mobilenations.com/rss/mb.xml'),
'http://www.mobilenations.com/about?format=RSS'),
]

View File

@@ -1,8 +1,4 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import Tag
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
@@ -31,383 +27,10 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
simultaneous_downloads = 20
use_embedded_content = False
recursions = 0
conversion_options = {
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
auto_cleanup = True
feeds = [
(u'NME News', u'http://www.nme.com/rss/news'),
(u'Reviews', u'http://www.nme.com/rss/reviews'),
(u'Blogs', u'http://www.nme.com/rss/blogs'),
(u'NME News', u'http://www.nme.com/news/feed'),
(u'Reviews', u'http://www.nme.com/reviews/feed/'),
(u'Blogs', u'http://www.nme.com/blogs/feed'),
]
keep_only_tags = [
dict(name='div', attrs={'id': 'content'}),
]
remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
remove_tags = [
dict(name='meta'),
dict(name='span', attrs={'class': 'article_info'}),
dict(name='div', attrs={'class': 'breadcrumbs'}),
dict(name='div', attrs={'class': 'mugshot'}),
dict(name='div', attrs={'class': 'header'}),
dict(name='div', attrs={'class': re.compile(
'youtube.*', re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile(
'socialbuttons.*', re.IGNORECASE)}),
dict(name='div', attrs={'class': 'clear_both'}),
dict(name='div', attrs={'class': re.compile(
'headline.*', re.IGNORECASE)}),
dict(name='div', attrs={'class': 'member-signedout'}),
dict(name='div', attrs={'class': re.compile(
'prev_next.*', re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile(
'article_related.*', re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile(
'feature_bar.*', re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile('ebay.*', re.IGNORECASE)}),
dict(name='div', attrs={'id': re.compile(
'morenews.*', re.IGNORECASE)}),
dict(name='div', attrs={'id': re.compile(
'ticketspopup.*', re.IGNORECASE)}),
dict(name='div', attrs={'id': re.compile(
'ratemy_logprompt.*', re.IGNORECASE)}),
dict(name='div', attrs={'id': re.compile(
'related_artist.*', re.IGNORECASE)}),
dict(name='img', attrs={'class': re.compile(
'video_play_large.*', re.IGNORECASE)}),
dict(name='ul', attrs={'class': re.compile(
'prev_next.*', re.IGNORECASE)}),
dict(name='ul', attrs={'class': re.compile(
'nme_store.*', re.IGNORECASE)}),
dict(name='p', attrs={'class': re.compile('top', re.IGNORECASE)}),
dict(name='table', attrs={
'class': re.compile('tickets.*', re.IGNORECASE)}),
]
masthead_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
def get_cover_url(self):
magazine_page_raw = self.index_to_soup(
'http://www.nme.com/magazine', raw=True)
magazine_page_raw = re.sub(
r'<script\b.+?</script>', '', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
magazine_page_raw = re.sub(
r'\!\[if ', '!--[if ', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
magazine_page = self.index_to_soup(magazine_page_raw)
cov = magazine_page.find('img', attrs={'class': 'magcover'})
cov2 = str(cov['src'])
br = browser()
br.set_handle_redirect(False)
try:
br.open_novisit(cov2)
cover_url = str(cov2)
except:
cover_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
return cover_url
def preprocess_raw_html(self, raw_html, url):
'''
Needed to work around a bug on the site that prevents blog posts from being parsed correctly
'''
raw_html = re.sub(r'\!\[if ', '!--[if ', raw_html,
flags=re.DOTALL | re.IGNORECASE)
return raw_html
def preprocess_html(self, soup):
youtube_regex = re.compile(
r'(?:youtube\.com/(?:[^/]+/.+/|(?:v|e(?:mbed)?)/|.*[?&]v=)|youtu\.be/)(?P<id>[^"&?/ ]{11})', re.DOTALL | re.IGNORECASE)
instagram_regex = re.compile(
r'.*?instagram.*?', re.DOTALL | re.IGNORECASE)
twitter_regex = re.compile(r'.*?twitter.*?', re.DOTALL | re.IGNORECASE)
visualise_regex = re.compile(
r'.*?visualise.*?', re.DOTALL | re.IGNORECASE)
soundcloud_regex = re.compile(
r'(?P<url>.*?(w|api)\.soundcloud.*?com\/(tracks|playlists)\/\d{8,9})', re.DOTALL | re.IGNORECASE)
dailymotion_regex = re.compile(
r'.*?dailymotion.*?', re.DOTALL | re.IGNORECASE)
spotify_regex = re.compile(r'.*?spotify.*?', re.DOTALL | re.IGNORECASE)
vine_regex = re.compile(r'.*?vine.*?', re.DOTALL | re.IGNORECASE)
doubleHtmlEntities = re.compile(
ur'(&amp;)(?P<e>[\d\w\#]*;)', re.DOTALL | re.IGNORECASE | re.UNICODE)
for iframe in soup.findAll('iframe'):
if iframe.has_key('src') and youtube_regex.search(iframe['src']) is not None: # noqa
pq = Tag(soup, 'blockquote')
br = Tag(soup, 'br')
pq.insert(0, '[ YouTube ]')
pq.insert(1, br)
m = youtube_regex.search(iframe['src'])
if m.group('id') is not None:
imgTag = Tag(soup, 'img', [
('src', 'http://img.youtube.com/vi/' + m.group('id') + '/0.jpg')])
pq.insert(len(pq.contents), imgTag)
pq.insert(len(pq.contents), iframe['src'])
iframe.replaceWith(pq)
elif iframe.has_key('src') and soundcloud_regex.search(iframe['src']) is not None: # noqa
m = soundcloud_regex.search(iframe['src'])
pq = Tag(soup, 'blockquote')
br = Tag(soup, 'br')
pq.insert(0, '[ SoundCloud ]')
pq.insert(1, br)
pq.insert(2, m.group('url'))
iframe.replaceWith(pq)
elif iframe.has_key('src') and dailymotion_regex.search(iframe['src']) is not None: # noqa
pq = Tag(soup, 'blockquote')
br = Tag(soup, 'br')
pq.insert(0, '[ DailyMotion ]')
pq.insert(1, br)
imgUrl = self.get_dailymotion_pic(iframe['src'])
if imgUrl is not None:
imgTag = Tag(soup, 'img', [('src', imgUrl)])
pq.insert(len(pq.contents), imgTag)
pq.insert(len(pq.contents), iframe['src'])
iframe.replaceWith(pq)
elif iframe.has_key('src') and spotify_regex.search(iframe['src']) is not None: # noqa
pq = Tag(soup, 'blockquote')
br = Tag(soup, 'br')
pq.insert(0, '[ Spotify ]')
pq.insert(1, br)
imgUrl = self.get_spotify_pic(iframe['src'])
if imgUrl is not None:
imgTag = Tag(soup, 'img', [('src', imgUrl)])
pq.insert(len(pq.contents), imgTag)
pq.insert(len(pq.contents), iframe['src'])
iframe.replaceWith(pq)
elif iframe.has_key('src') and vine_regex.search(iframe['src']) is not None: # noqa
pq = Tag(soup, 'blockquote')
br = Tag(soup, 'br')
pq.insert(0, '[ Vine ]')
pq.insert(1, br)
imgUrl = self.get_vine_pic(iframe['src'])
if imgUrl is not None:
imgTag = Tag(soup, 'img', [('src', imgUrl)])
pq.insert(len(pq.contents), imgTag)
pq.insert(len(pq.contents), iframe['src'])
iframe.replaceWith(pq)
elif iframe.has_key('src') and visualise_regex.search(iframe['src']) is not None: # noqa
imgUrl = self.get_visualise_pic(iframe['src'])
if imgUrl is not None:
imgTag = Tag(soup, 'img', [('src', imgUrl)])
iframe.replaceWith(imgTag)
for blockquote in soup.findAll('blockquote'):
if blockquote.has_key('class') and twitter_regex.search(blockquote['class']) is not None: # noqa
pq = Tag(soup, 'blockquote')
br = Tag(soup, 'br')
pq.insert(0, '[ Twitter ]')
pq.insert(len(pq.contents), br)
match = re.search(
"(?P<url>pic\.twitter[^\s<]+)", str(blockquote))
if match is not None:
img = self.get_twitter_pic(str(match.group("url")))
if img is not None:
pq.insert(len(pq.contents), img)
for p in blockquote.findAll(name='p'):
x = 0
plen = len(p.contents)
while True:
c = len(pq.contents)
if p.contents[x].string is not None:
pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(
2), str(p.contents[x].string), re.IGNORECASE | re.UNICODE))
else:
pq.insert(c, p.contents[x].content)
x += 1
if x == plen:
break
br = Tag(soup, 'br')
pq.insert(len(pq.contents), br)
p.extract()
if len(blockquote.contents) > 0:
x = 0
xlen = len(blockquote.contents)
while True:
c = len(pq.contents)
if blockquote.contents[x].string is not None:
pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(
2), str(blockquote.contents[x].string), re.IGNORECASE | re.UNICODE))
else:
pq.insert(c, blockquote.contents[x].content)
x += 1
if x == xlen:
break
blockquote.replaceWith(pq)
elif blockquote.has_key('class') and instagram_regex.search(blockquote['class']) is not None: # noqa
pq = Tag(soup, 'blockquote')
br = Tag(soup, 'br')
pq.insert(0, '[ Instagram ]')
pq.insert(1, br)
a = blockquote.find(name='a', attrs={'href': instagram_regex})
imgUrl = None
if a is not None:
imgUrl = self.get_instagram_pic(str(a['href']))
if imgUrl is not None:
img = Tag(soup, 'img', [('src', imgUrl)])
pq.insert(len(pq.contents), img)
for p in blockquote.findAll(name='p'):
x = 0
plen = len(p.contents)
while x < plen:
c = len(pq.contents)
if p.contents[x].string is not None:
pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(
2), str(p.contents[x].string), re.IGNORECASE | re.UNICODE))
# else:
# pq.insert(c, p.contents[x].content)
x += 1
br = Tag(soup, 'br')
c = len(pq.contents)
pq.insert(c, br)
blockquote.replaceWith(pq)
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
elif alink.img is not None:
tstr = alink.img
alink.replaceWith(tstr)
elif alink.span is not None:
tstr = alink.span
alink.replaceWith(tstr)
return soup
def get_visualise_pic(self, url):
returnValue = None
try:
raw = self.browser.open(url).read()
except:
print '404: ' + url
return returnValue
bs = BeautifulSoup(raw)
imgRaw = bs.find(name='meta', attrs={'property': 'og:image'})
if imgRaw is not None:
returnValue = str(imgRaw['content'])
return returnValue
def get_twitter_pic(self, url):
returnValue = None
try:
raw = self.browser.open('https://' + url).read()
except:
print '404: ' + url
return returnValue
bs = BeautifulSoup(raw)
refresh = bs.find('meta', {'http-equiv': 'refresh'})
if refresh is not None:
content = refresh.get('content').partition('=')[2]
try:
raw = self.browser.open(content).read()
except:
print '404: ' + url
return returnValue
bs = BeautifulSoup(raw)
img = bs.find(name='img', attrs={
'alt': re.compile('.*permalink.*', re.IGNORECASE)})
if img is not None:
returnValue = img
return returnValue
def get_soundcloud_pic(self, url):
# content is loaded via javascript and requires a login and/or a registered application identification
# returnValue = None
# raw = self.browser.open(soundcloudUrl + '&visual=true').read()
# bs = BeautifulSoup(raw)
# imgRaw = bs.find(name='div', attrs={'style':re.compile(r'backgroud-image:*?',re.IGNORECASE)})
# if imgRaw is not None:
# returnValue = str(imgRaw['style'])
return None # returnValue
def get_instagram_pic(self, url):
returnValue = None
try:
raw = self.browser.open(url).read()
except:
print '404: ' + url
return returnValue
m = re.search('\"display_src\":\"(?P<url>http[s]?:.*?)\"', str(raw))
if m is not None:
returnValue = re.sub(r'\\', '', m.group(
"url"), flags=re.DOTALL | re.IGNORECASE)
return returnValue
def get_dailymotion_pic(self, url):
returnValue = None
try:
raw = self.browser.open(url).read()
except:
print '404: ' + url
return returnValue
m = re.search('("thumbnail_url\"\:\")(?P<url>http.*?)(\")', str(raw))
if m is not None:
returnValue = re.sub(r'\\', '', m.group(
"url"), flags=re.DOTALL | re.IGNORECASE)
return returnValue
def get_spotify_pic(self, url):
returnValue = None
try:
raw = self.browser.open(url).read()
except:
print '404: ' + url
return returnValue
m = re.search('data-ca=\"(?P<url>.*?)\"', str(raw))
if m is not None:
returnValue = m.group("url")
return returnValue
def get_vine_pic(self, url):
returnValue = None
try:
raw = self.browser.open(url).read()
except:
print '404: ' + url
return returnValue
m = re.search('"thumbnail.*?src=\"(?P<url>.*?)\"', str(raw))
if m is not None:
returnValue = m.group("url")
return returnValue
preprocess_regexps = [
(re.compile(r'<script\b.+?</script>', re.DOTALL | re.IGNORECASE), lambda h1: ''),
(re.compile(r'<a.* id="buy-tickets-button".*</a>', re.IGNORECASE), lambda h2: ''),
(re.compile(r'<a.* class="gallery.*</a>', re.IGNORECASE), lambda h2: ''),
]
extra_css = '''
h1, h2 {
font-family:Arial,Helvetica,sans-serif;
font-weight:bold;font-size:large;
}
h3 {
font-family:Arial,Helvetica,sans-serif;
font-weight:normal;
font-size:small;
font-style:italic;
display:inline;
}
body {
font-family:Helvetica,Arial,sans-serif;
font-size:small;
}
blockquote {
font-family:"Courier New",
Courier, monospace;
font-size:90%;
}
img {
display:block;
}
.date{
font-style:italic;
font-weight:normal;
}
.article_header>p:not(.date){
font-weight:bold;
}
'''
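The preprocess_html removed above rewrites embedded players (YouTube, SoundCloud, DailyMotion, Spotify, Vine) into plain blockquote placeholders carrying a label, a thumbnail and the original URL, so the embeds survive conversion to static pages. A minimal sketch of that pattern for a single service, using the same BeautifulSoup 3 calls as the removed code (the helper name is ours; the thumbnail URL is YouTube's public vi/<id>/0.jpg convention):

import re

from calibre.ebooks.BeautifulSoup import Tag


def replace_youtube_iframes(soup):
    # Swap every YouTube iframe for a static placeholder blockquote.
    youtube_regex = re.compile(
        r'(?:youtube\.com/(?:[^/]+/.+/|(?:v|e(?:mbed)?)/|.*[?&]v=)|youtu\.be/)(?P<id>[^"&?/ ]{11})',
        re.IGNORECASE)
    for iframe in soup.findAll('iframe', src=True):
        m = youtube_regex.search(iframe['src'])
        if m is None:
            continue
        pq = Tag(soup, 'blockquote')
        pq.insert(0, '[ YouTube ]')
        pq.insert(1, Tag(soup, 'br'))
        pq.insert(2, Tag(soup, 'img', [
            ('src', 'http://img.youtube.com/vi/' + m.group('id') + '/0.jpg')]))
        pq.insert(3, iframe['src'])
        iframe.replaceWith(pq)
    return soup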

View File

@@ -1,22 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
from calibre.web.feeds.news import BasicNewsRecipe
class TheResurgence(BasicNewsRecipe):
title = u'The Resurgence'
__author__ = 'Peter Grungi'
language = 'en'
oldest_article = 7
max_articles_per_feed = 10
auto_cleanup = True
cover_url = 'http://cdn.theresurgence.com/images/logo.png'
masthead_url = 'http://cdn.theresurgence.com/images/logo.png'
language = 'en'
publisher = 'The Resurgence'
author = 'The Resurgence'
feeds = [
(u'The Resurgence', u'http://feeds.theresurgence.com/TheResurgence?format=xml')]

View File

@@ -10,30 +10,10 @@ class SecurityWatch(BasicNewsRecipe):
oldest_article = 14
max_articles_per_feed = 100
use_embedded_content = False
filter_regexps = [r'feedads\.googleadservices\.com']
filter_regexps = [r'ad\.doubleclick']
filter_regexps = [r'advert']
language = 'en'
extra_css = 'div {text-align:left}'
remove_tags = [dict(id='topBannerContainer'),
dict(id='topBannerSmall'),
dict(id='topSearchBar'),
dict(id='topSearchForm'),
dict(id='rtBannerMPU'),
dict(id='topNavBar'),
dict(id='breadcrumbs'),
# dict(id='entry-28272'),
dict(id='topSearchLinks'),
dict(name='span', attrs={'class': 'date'})]
remove_tags_after = [dict(id='googlemp')]
auto_cleanup = True
feeds = [
(u'securitywatch', u'http://feeds.ziffdavisenterprise.com/RSS/security_watch/')]
def postprocess_html(self, soup, first_fetch):
for t in soup.findAll(['table', 'tr', 'td']):
t.name = 'div'
return soup
(u'securitywatch',
u'http://feeds.pcmag.com/Rss.aspx/SectionArticles?sectionId=28026')
]

View File

@@ -17,39 +17,38 @@ class AdvancedUserRecipe1315899507(BasicNewsRecipe):
auto_cleanup = True
remove_empty_feeds = True
publication_type = 'newspaper'
masthead_url = 'http://media.signonsandiego.com/e2/sosd/images/sosd_logo.png'
feeds = [
(u'Latest News', u'http://www.signonsandiego.com/rss/headlines/'),
(u'Local News', u'http://www.signonsandiego.com/rss/headlines/metro/'),
(u'Business', u'http://www.signonsandiego.com/rss/headlines/business/'),
(u'Politics', u'http://www.signonsandiego.com/rss/headlines/local/politics/'),
(u'Border & Immigration', u'http://www.signonsandiego.com/rss/headlines/border/'),
(u'Courts', u'http://www.signonsandiego.com/rss/headlines/courts/'),
(u'Education', u'http://www.signonsandiego.com/news/education/'),
(u'Sports', u'http://www.signonsandiego.com/rss/headlines/sports/'),
(u'Chargers', u'http://www.signonsandiego.com/rss/headlines/sports/chargers/'),
(u'Padres', u'http://www.signonsandiego.com/rss/headlines/sports/padres/'),
(u'NFL', u'http://www.signonsandiego.com/rss/headlines/sports/nfl/'),
(u'NBA', u'http://www.signonsandiego.com/rss/headlines/sports/nba/'),
(u'Nick Canepa', u'http://www.signonsandiego.com/rss/authors/nick-canepa/'),
(u'Tim Sullivan', u'http://www.signonsandiego.com/rss/authors/tim-sullivan/'),
(u'Ruben Navarrette', u'http://www.signonsandiego.com/rss/authors/ruben-navarrette/'),
(u'Diane Bell', u'http://www.signonsandiego.com/rss/authors/diane-bell/'),
(u'Smart Living', u'http://www.signonsandiego.com/rss/headlines/smart-living/'),
(u'Photos', u'http://www.signonsandiego.com/rss/photos/'),
(u'Arts', u'http://www.signonsandiego.com/rss/headlines/night-and-day/theater-arts/'),
(u'Books', u'http://www.signonsandiego.com/rss/headlines/lifestyle/books/'),
(u'Currents-Passages',
u'http://www.signonsandiego.com/rss/headlines/lifestyle/currents/passages/'),
(u'Currents-Weekend',
u'http://www.signonsandiego.com/news/rss2/daily/currentsweekend.xml'),
(u'Dialog', u'http://www.signonsandiego.com/news/rss2/daily/dialog.xml'),
(u'Home', u'http://www.signonsandiego.com/rss/headlines/home/'),
(u'Homescape', u'http://www.signonsandiego.com/rss/headlines/lifestyle/homescape/'),
(u'Night & Day', u'http://www.signonsandiego.com/news/rss2/daily/nightday.xml'),
(u'Opinion', u'http://www.signonsandiego.com/rss/headlines/opinion/'),
(u'Quest', u'http://www.signonsandiego.com/news/rss2/daily/quest.xml'),
(u'Travel', u'http://www.signonsandiego.com/news/rss2/daily/travel.xml'),
(u'Wheels', u'http://www.signonsandiego.com/news/rss2/daily/wheels.xml')
(u'Latest News',
u'http://www.sandiegouniontribune.com/latest/rss2.0.xml'),
(u'Business',
u'http://www.sandiegouniontribune.com/business/rss2.0.xml'),
(u'Politics',
u'http://www.sandiegouniontribune.com/news/politics/rss2.0.xml'),
(u'Immigration',
u'http://www.sandiegouniontribune.com/news/immigration/rss2.0.xml'),
(u'Courts',
u'http://www.sandiegouniontribune.com/news/public-safety/rss2.0.xml'),
(u'Education',
u'http://www.sandiegouniontribune.com/news/education/rss2.0.xml'),
(u'Sports',
u'http://www.sandiegouniontribune.com/sports/rss2.0.xml'),
(u'Chargers',
u'http://www.sandiegouniontribune.com/sports/chargers/rss2.0.xml'),
(u'Padres',
u'http://www.sandiegouniontribune.com/sports/padres/rss2.0.xml'),
(u'NFL',
u'http://www.sandiegouniontribune.com/sports/nfl/rss2.0.xml'),
(u'NBA',
u'http://www.sandiegouniontribune.com/sports/nba/rss2.0.xml'),
(u'Photos',
u'http://www.sandiegouniontribune.com/visuals/rss2.0.xml'),
(u'Entertainment',
u'http://www.sandiegouniontribune.com/entertainment/rss2.0.xml'),
(u'Books',
u'http://www.sandiegouniontribune.com/entertainment/books/rss2.0.xml'),
(u'Opinion',
u'http://www.sandiegouniontribune.com/opinion/rss2.0.xml'),
(u'Travel',
u'http://www.sandiegouniontribune.com/lifestyle/travel/rss2.0.xml'),
]

View File

@@ -28,5 +28,5 @@ class Starbulletin(BasicNewsRecipe):
(u'Business', u'http://www.staradvertiser.com/business/feed/'),
(u'Sports', u'http://www.staradvertiser.com/sports/feed/'),
(u'Features',
u'http://www.staradvertiser.com/featurespremium/index.rss')
u'http://www.staradvertiser.com/features/feed/')
]

View File

@@ -1,97 +0,0 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class TelevisionWithoutPity(BasicNewsRecipe):
title = u'Television Without Pity'
language = 'en'
__author__ = 'Snarkastica'
# Used for pulling down an entire show, not just the RSS feed
SHOW = 'http://www.televisionwithoutpity.com/show/SHOW-NAME-HERE/recaps/'
oldest_article = 7 # days
max_articles_per_feed = 25
# reverse_article_order=True # Useful for entire show, to display in episode order
use_embedded_content = False
preprocess_regexps = [(re.compile(r'<span class="headline_recap_title .*?>',
re.DOTALL | re.IGNORECASE), lambda match: '<span class="headline_recap_title">')]
keep_only_tags = [dict(name='span', attrs={'class': 'headline_recap_title'}), dict(
name='p', attrs={'class': 'byline'}), dict(name='div', attrs={'class': 'body_recap'}), dict(name='h1')]
no_stylesheets = True
# Comment this out and configure parse_index() to retrieve a single show
feeds = [
('Latest Recaps',
'http://www.televisionwithoutpity.com/rss.xml'),
]
'''
This method can be used to grab all recaps for a single show
Set the SHOW constant at the beginning of this file to the URL for a show's recap page
(the page listing all recaps, usually of the form:
http://www.televisionwithoutpity.com/show/SHOW-NAME/recaps/)
Where SHOW-NAME is the hyphenated name of the show.
To use:
1. Comment out feeds = [...] earlier in this file
2. Set the SHOW constant to the show's recap page
3. Uncomment the following function
'''
'''
def parse_index(self):
soup = self.index_to_soup(self.SHOW)
feeds = []
articles = []
showTitle = soup.find('h1').string
recaps = soup.find('table')
for ep in recaps.findAll('tr'):
epData = ep.findAll('td')
epNum = epData[0].find(text=True).strip()
if not epNum == "Ep.":
epT = self.tag_to_string(epData[1].find('em')).strip()
epST = " (or " + self.tag_to_string(epData[1].find('h3')).strip() + ")"
epTitle = epNum + ": " + epT + epST
epData[1].find('em').extract()
epURL = epData[1].find('a', href=True)
epURL = epURL['href']
epSum = self.tag_to_string(epData[1].find('p')).strip()
epDate = epData[2].find(text=True).strip()
epAuthor = self.tag_to_string(epData[4].find('p')).strip()
articles.append({'title':epTitle, 'url':epURL, 'description':epSum, 'date':epDate, 'author':epAuthor})
feeds.append((showTitle, articles))
#self.abort_recipe_processing("test")
return feeds
'''
# This will add subsequent pages of multipage recaps to a single article
# page
def append_page(self, soup, appendtag, position):
# If false, will still grab single-page recaplets
if (soup.find('p', attrs={'class': 'pages'})):
pager = soup.find('p', attrs={'class': 'pages'}).find(text='Next')
if pager:
nexturl = pager.parent['href']
soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'class': 'body_recap'})
for it in texttag.findAll(style=True):
del it['style']
newpos = len(texttag.contents)
self.append_page(soup2, texttag, newpos)
texttag.extract()
appendtag.insert(position, texttag)
def preprocess_html(self, soup):
self.append_page(soup, soup.body, 3)
return soup
# Remove the multi-page links (we had to keep these in for append_page(), but they can go away now).
# Could have used CSS to hide, but some readers ignore CSS.
def postprocess_html(self, soup, first_fetch):
paginator = soup.findAll('p', attrs={'class': 'pages'})
if paginator:
for p in paginator:
p.extract()
# TODO: Fix this so it converts the headline class into a heading 1
return soup