Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-07 10:14:46 -04:00

Commit 96947fd01a: Merge branch 'master' of https://github.com/CoderAllan/calibre
@@ -18,35 +18,6 @@ class EdgeConversationRSS(BasicNewsRecipe):
    oldest_article = 60
    max_articles_per_feed = 100
    no_stylesheets = True
    auto_cleanup = True

    keep_only_tags = [
        dict(name='div', attrs={'class': 'HomeLeftPannel IMGCTRL'})]
    remove_tags = [
        dict(name='div', attrs={'class': 'Logo'})
    ]

    feeds = [(u'Edge RSS', u'http://edge.org/feeds/')]

    def print_version(self, url):
        return url.replace('conversation/', 'conversation.php?cid=')

    def parse_feeds(self):

        # Call parent's method.
        feeds = BasicNewsRecipe.parse_feeds(self)

        # Loop through all feeds.
        for feed in feeds:

            # Loop through all articles in feed.
            for article in feed.articles[:]:

                # Remove anything that is not a conversation, and remove PDF
                # files as well...

                if not ('CONVERSATION' in article.title):
                    feed.articles.remove(article)
                elif 'pdf' in article.url:
                    feed.articles.remove(article)

        return feeds

    feeds = [(u'Edge RSS', u'http://edge.org/feed')]
@@ -1,53 +0,0 @@
from calibre.web.feeds.news import BasicNewsRecipe


class EuropeanVoice(BasicNewsRecipe):
    title = u'European Voice'
    __author__ = 'malfi'
    oldest_article = 14
    max_articles_per_feed = 100
    no_stylesheets = True
    cover_url = 'http://www.europeanvoice.com/Css/images/logo.gif'
    language = 'en'
    keep_only_tags = [dict(name='div', attrs={'id': 'articleLeftColumn'})]
    remove_tags = [dict(name='div', attrs={'id': 'BreadCrump'})]
    feeds = [
        (u'Whole site ', u'http://www.europeanvoice.com/Rss/2.xml'),
        (u'News and analysis', u'http://www.europeanvoice.com/Rss/6.xml'),
        (u'Comment', u'http://www.europeanvoice.com/Rss/7.xml'),
        (u'Special reports', u'http://www.europeanvoice.com/Rss/5.xml'),
        (u'People', u'http://www.europeanvoice.com/Rss/8.xml'),
        (u'Career', u'http://www.europeanvoice.com/Rss/11.xml'),
        (u'Policies', u'http://www.europeanvoice.com/Rss/4.xml'),
        (u'EVents', u'http://www.europeanvoice.com/Rss/10.xml'),
        (u'Policies - Economics', u'http://www.europeanvoice.com/Rss/31.xml'),
        (u'Policies - Business', u'http://www.europeanvoice.com/Rss/19.xml'),
        (u'Policies - Trade', u'http://www.europeanvoice.com/Rss/25.xml'),
        (u'Policies - Information society',
         u'http://www.europeanvoice.com/Rss/20.xml'),
        (u'Policies - Energy', u'http://www.europeanvoice.com/Rss/15.xml'),
        (u'Policies - Transport', u'http://www.europeanvoice.com/Rss/18.xml'),
        (u'Policies - Climate change', u'http://www.europeanvoice.com/Rss/16.xml'),
        (u'Policies - Environment', u'http://www.europeanvoice.com/Rss/17.xml'),
        (u'Policies - Farming & food', u'http://www.europeanvoice.com/Rss/23.xml'),
        (u'Policies - Health & society', u'http://www.europeanvoice.com/Rss/24.xml'),
        (u'Policies - Justice', u'http://www.europeanvoice.com/Rss/29.xml'),
        (u'Policies - Foreign affairs', u'http://www.europeanvoice.com/Rss/27.xml')
    ]
    extra_css = '''
    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''

    def print_version(self, url):
        return url + '?bPrint=1'

    def preprocess_html(self, soup):
        denied = soup.findAll(True, text='Subscribers')
        if denied:
            raise Exception(
                'Article skipped, because content can only be seen with subscription')
        return soup
@@ -15,29 +15,17 @@ class Kitsapsun(BasicNewsRecipe):
    publisher = 'Scripps Interactive Newspapers Group'
    category = 'news, Kitsap county, USA'
    language = 'en'
    oldest_article = 2
    max_articles_per_feed = 100
    oldest_article = 7
    max_articles_per_feed = 50
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    auto_cleanup = True

    conversion_options = {
        'comments': description, 'tags': category, 'language': language, 'publisher': publisher
    }

    keep_only_tags = [
        dict(name='div', attrs={'id': ['story_meta', 'story_content']})]

    remove_tags = [dict(name=['object', 'link', 'embed', 'form', 'iframe'])]

    feeds = [

        (u'News', u'http://www.kitsapsun.com/rss/headlines/news/'),
        (u'Business', u'http://www.kitsapsun.com/rss/headlines/business/'),
        (u'Communities', u'http://www.kitsapsun.com/rss/headlines/communities/'),
        (u'Entertainment', u'http://www.kitsapsun.com/rss/headlines/entertainment/'),
        (u'Lifestyles', u'http://www.kitsapsun.com/rss/headlines/lifestyles/')
    ]

    def print_version(self, url):
        return url.rpartition('/')[0] + '/?print=1'
    feeds = [(u'News', u'http://www.kitsapsun.com/feeds/rss/news'),
             (u'Sports', u'http://www.kitsapsun.com/feeds/rss/sports'),
             (u'Entertainment',
              u'http://www.kitsapsun.com/feeds/rss/entertainment'),
             (u'Lifestyles', u'http://www.kitsapsun.com/feeds/rss/lifestyle'),
             (u'Opinion', u'http://www.kitsapsun.com/feeds/rss/opinion'),
             ]
@@ -14,5 +14,5 @@ class HindustanTimes(BasicNewsRecipe):

    feeds = [
        ('News',
         'http://www.mobilenations.com/rss/mb.xml'),
         'http://www.mobilenations.com/about?format=RSS'),
    ]
@@ -1,8 +1,4 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import Tag


class AdvancedUserRecipe1306061239(BasicNewsRecipe):
@@ -31,383 +27,10 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    simultaneous_downloads = 20
    use_embedded_content = False
    recursions = 0

    conversion_options = {
        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
    }
    auto_cleanup = True

    feeds = [
        (u'NME News', u'http://www.nme.com/rss/news'),
        (u'Reviews', u'http://www.nme.com/rss/reviews'),
        (u'Blogs', u'http://www.nme.com/rss/blogs'),
        (u'NME News', u'http://www.nme.com/news/feed'),
        (u'Reviews', u'http://www.nme.com/reviews/feed/'),
        (u'Blogs', u'http://www.nme.com/blogs/feed'),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'id': 'content'}),
    ]

    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
                         'valign', 'vspace', 'hspace', 'alt', 'width', 'height']

    remove_tags = [
        dict(name='meta'),
        dict(name='span', attrs={'class': 'article_info'}),
        dict(name='div', attrs={'class': 'breadcrumbs'}),
        dict(name='div', attrs={'class': 'mugshot'}),
        dict(name='div', attrs={'class': 'header'}),
        dict(name='div', attrs={'class': re.compile(
            'youtube.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(
            'socialbuttons.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': 'clear_both'}),
        dict(name='div', attrs={'class': re.compile(
            'headline.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': 'member-signedout'}),
        dict(name='div', attrs={'class': re.compile(
            'prev_next.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(
            'article_related.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(
            'feature_bar.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile('ebay.*', re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile(
            'morenews.*', re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile(
            'ticketspopup.*', re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile(
            'ratemy_logprompt.*', re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile(
            'related_artist.*', re.IGNORECASE)}),
        dict(name='img', attrs={'class': re.compile(
            'video_play_large.*', re.IGNORECASE)}),
        dict(name='ul', attrs={'class': re.compile(
            'prev_next.*', re.IGNORECASE)}),
        dict(name='ul', attrs={'class': re.compile(
            'nme_store.*', re.IGNORECASE)}),
        dict(name='p', attrs={'class': re.compile('top', re.IGNORECASE)}),
        dict(name='table', attrs={
            'class': re.compile('tickets.*', re.IGNORECASE)}),
    ]

    masthead_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'

    def get_cover_url(self):
        magazine_page_raw = self.index_to_soup(
            'http://www.nme.com/magazine', raw=True)
        magazine_page_raw = re.sub(
            r'<script\b.+?</script>', '', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
        magazine_page_raw = re.sub(
            r'\!\[if ', '!--[if ', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
        magazine_page = self.index_to_soup(magazine_page_raw)
        cov = magazine_page.find('img', attrs={'class': 'magcover'})

        cov2 = str(cov['src'])

        br = browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cov2)
            cover_url = str(cov2)
        except:
            cover_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
        return cover_url

    def preprocess_raw_html(self, raw_html, url):
        '''
        Need this for a bug on site that prevents blogg post being parsed correctly
        '''
        raw_html = re.sub(r'\!\[if ', '!--[if ', raw_html,
                          flags=re.DOTALL | re.IGNORECASE)

        return raw_html

    def preprocess_html(self, soup):
        youtube_regex = re.compile(
            r'(?:youtube\.com/(?:[^/]+/.+/|(?:v|e(?:mbed)?)/|.*[?&]v=)|youtu\.be/)(?P<id>[^"&?/ ]{11})', re.DOTALL | re.IGNORECASE)
        instagram_regex = re.compile(
            r'.*?instagram.*?', re.DOTALL | re.IGNORECASE)
        twitter_regex = re.compile(r'.*?twitter.*?', re.DOTALL | re.IGNORECASE)
        visualise_regex = re.compile(
            r'.*?visualise.*?', re.DOTALL | re.IGNORECASE)
        soundcloud_regex = re.compile(
            r'(?P<url>.*?(w|api)\.soundcloud.*?com\/(tracks|playlists)\/\d{8,9})', re.DOTALL | re.IGNORECASE)
        dailymotion_regex = re.compile(
            r'.*?dailymotion.*?', re.DOTALL | re.IGNORECASE)
        spotify_regex = re.compile(r'.*?spotify.*?', re.DOTALL | re.IGNORECASE)
        vine_regex = re.compile(r'.*?vine.*?', re.DOTALL | re.IGNORECASE)
        doubleHtmlEntities = re.compile(
            ur'(&)(?P<e>[\d\w\#]*;)', re.DOTALL | re.IGNORECASE | re.UNICODE)
        for iframe in soup.findAll('iframe'):
            if iframe.has_key('src') and youtube_regex.search(iframe['src']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ YouTube ]')
                pq.insert(1, br)
                m = youtube_regex.search(iframe['src'])
                if m.group('id') is not None:
                    imgTag = Tag(soup, 'img', [
                        ('src', 'http://img.youtube.com/vi/' + m.group('id') + '/0.jpg')])
                    pq.insert(len(pq.contents), imgTag)
                pq.insert(len(pq.contents), iframe['src'])
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and soundcloud_regex.search(iframe['src']) is not None:  # noqa
                m = soundcloud_regex.search(iframe['src'])
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ SoundCloud ]')
                pq.insert(1, br)
                pq.insert(2, m.group('url'))
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and dailymotion_regex.search(iframe['src']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ DailyMotion ]')
                pq.insert(1, br)
                imgUrl = self.get_dailymotion_pic(iframe['src'])
                if imgUrl is not None:
                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
                    pq.insert(len(pq.contents), imgTag)
                pq.insert(len(pq.contents), iframe['src'])
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and spotify_regex.search(iframe['src']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ Spotify ]')
                pq.insert(1, br)
                imgUrl = self.get_spotify_pic(iframe['src'])
                if imgUrl is not None:
                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
                    pq.insert(len(pq.contents), imgTag)
                pq.insert(len(pq.contents), iframe['src'])
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and vine_regex.search(iframe['src']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ Vine ]')
                pq.insert(1, br)
                imgUrl = self.get_vine_pic(iframe['src'])
                if imgUrl is not None:
                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
                    pq.insert(len(pq.contents), imgTag)
                pq.insert(len(pq.contents), iframe['src'])
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and visualise_regex.search(iframe['src']) is not None:  # noqa
                imgUrl = self.get_visualise_pic(iframe['src'])
                if imgUrl is not None:
                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
                    iframe.replaceWith(imgTag)
        for blockquote in soup.findAll('blockquote'):
            if blockquote.has_key('class') and twitter_regex.search(blockquote['class']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ Twitter ]')
                pq.insert(len(pq.contents), br)
                match = re.search(
                    "(?P<url>pic\.twitter[^\s<]+)", str(blockquote))
                if match is not None:
                    img = self.get_twitter_pic(str(match.group("url")))
                    if img is not None:
                        pq.insert(len(pq.contents), img)
                for p in blockquote.findAll(name='p'):
                    x = 0
                    plen = len(p.contents)
                    while True:
                        c = len(pq.contents)
                        if p.contents[x].string is not None:
                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(
                                2), str(p.contents[x].string), re.IGNORECASE | re.UNICODE))
                        else:
                            pq.insert(c, p.contents[x].content)
                        x += 1
                        if x == plen:
                            break
                    br = Tag(soup, 'br')
                    pq.insert(len(pq.contents), br)
                    p.extract()
                if len(blockquote.contents) > 0:
                    x = 0
                    xlen = len(blockquote.contents)
                    while True:
                        c = len(pq.contents)
                        if blockquote.contents[x].string is not None:
                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(
                                2), str(blockquote.contents[x].string), re.IGNORECASE | re.UNICODE))
                        else:
                            pq.insert(c, blockquote.contents[x].content)
                        x += 1
                        if x == xlen:
                            break
                blockquote.replaceWith(pq)
            elif blockquote.has_key('class') and instagram_regex.search(blockquote['class']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ Instagram ]')
                pq.insert(1, br)
                a = blockquote.find(name='a', attrs={'href': instagram_regex})
                imgUrl = None
                if a is not None:
                    imgUrl = self.get_instagram_pic(str(a['href']))
                if imgUrl is not None:
                    img = Tag(soup, 'img', [('src', imgUrl)])
                    pq.insert(len(pq.contents), img)
                for p in blockquote.findAll(name='p'):
                    x = 0
                    plen = len(p.contents)
                    while x < plen:
                        c = len(pq.contents)
                        if p.contents[x].string is not None:
                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(
                                2), str(p.contents[x].string), re.IGNORECASE | re.UNICODE))
                        # else:
                        # pq.insert(c, p.contents[x].content)
                        x += 1
                    br = Tag(soup, 'br')
                    c = len(pq.contents)
                    pq.insert(c, br)
                blockquote.replaceWith(pq)
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
            elif alink.img is not None:
                tstr = alink.img
                alink.replaceWith(tstr)
            elif alink.span is not None:
                tstr = alink.span
                alink.replaceWith(tstr)
        return soup

    def get_visualise_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        bs = BeautifulSoup(raw)
        imgRaw = bs.find(name='meta', attrs={'property': 'og:image'})
        if imgRaw is not None:
            returnValue = str(imgRaw['content'])
        return returnValue

    def get_twitter_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open('https://' + url).read()
        except:
            print '404: ' + url
            return returnValue
        bs = BeautifulSoup(raw)
        refresh = bs.find('meta', {'http-equiv': 'refresh'})
        if refresh is not None:
            content = refresh.get('content').partition('=')[2]
            try:
                raw = self.browser.open(content).read()
            except:
                print '404: ' + url
                return returnValue
            bs = BeautifulSoup(raw)
            img = bs.find(name='img', attrs={
                'alt': re.compile('.*permalink.*', re.IGNORECASE)})
            if img is not None:
                returnValue = img
        return returnValue

    def get_soundcloud_pic(self, url):
        # content loaded via javascript and require an login and/or registered application identification
        # returnValue = None
        # raw = self.browser.open(soundcloudUrl + '&visual=true').read()
        # bs = BeautifulSoup(raw)
        # imgRaw = bs.find(name='div', attrs={'style':re.compile(r'backgroud-image:*?',re.IGNORECASE)})
        # if imgRaw is not None:
        # returnValue = str(imgRaw['style'])
        return None  # returnValue

    def get_instagram_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        m = re.search('\"display_src\":\"(?P<url>http[s]?:.*?)\"', str(raw))
        if m is not None:
            returnValue = re.sub(r'\\', '', m.group(
                "url"), flags=re.DOTALL | re.IGNORECASE)
        return returnValue

    def get_dailymotion_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        m = re.search('("thumbnail_url\"\:\")(?P<url>http.*?)(\")', str(raw))
        if m is not None:
            returnValue = re.sub(r'\\', '', m.group(
                "url"), flags=re.DOTALL | re.IGNORECASE)
        return returnValue

    def get_spotify_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        m = re.search('data-ca=\"(?P<url>.*?)\"', str(raw))
        if m is not None:
            returnValue = m.group("url")
        return returnValue

    def get_vine_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        m = re.search('"thumbnail.*?src=\"(?P<url>.*?)\"', str(raw))
        if m is not None:
            returnValue = m.group("url")
        return returnValue

    preprocess_regexps = [
        (re.compile(r'<script\b.+?</script>', re.DOTALL | re.IGNORECASE), lambda h1: ''),
        (re.compile(r'<a.* id="buy-tickets-button".*</a>', re.IGNORECASE), lambda h2: ''),
        (re.compile(r'<a.* class="gallery.*</a>', re.IGNORECASE), lambda h2: ''),
    ]

    extra_css = '''
        h1 h2 {
            font-family:Arial,Helvetica,sans-serif;
            font-weight:bold;font-size:large;
        }
        h3 {
            font-family:Arial,Helvetica,sans-serif;
            font-weight:normal;
            font-size:small;
            font-style:italic;
            display:inline;
        }
        body {
            font-family:Helvetica,Arial,sans-serif;
            font-size:small;
        }
        blockquote {
            font-family:"Courier New",
            Courier, monospace;
            font-size:90%;
        }
        img {
            display:block;
        }
        .date{
            font-style:italic;
            font-weight:normal;
        }
        .article_header>p:not(.date){
            font-weight:bold;
        }
    '''
@@ -1,22 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'

from calibre.web.feeds.news import BasicNewsRecipe


class TheResurgence(BasicNewsRecipe):
    title = u'The Resurgence'
    __author__ = 'Peter Grungi'
    language = 'en'

    oldest_article = 7
    max_articles_per_feed = 10
    auto_cleanup = True
    cover_url = 'http://cdn.theresurgence.com/images/logo.png'
    masthead_url = 'http://cdn.theresurgence.com/images/logo.png'
    language = 'en'
    publisher = 'The Resurgence'
    author = 'The Resurgence'

    feeds = [
        (u'The Resurgence', u'http://feeds.theresurgence.com/TheResurgence?format=xml')]
@@ -10,30 +10,10 @@ class SecurityWatch(BasicNewsRecipe):
    oldest_article = 14
    max_articles_per_feed = 100
    use_embedded_content = False
    filter_regexps = [r'feedads\.googleadservices\.com']
    filter_regexps = [r'ad\.doubleclick']
    filter_regexps = [r'advert']
    language = 'en'

    extra_css = 'div {text-align:left}'

    remove_tags = [dict(id='topBannerContainer'),
                   dict(id='topBannerSmall'),
                   dict(id='topSearchBar'),
                   dict(id='topSearchForm'),
                   dict(id='rtBannerMPU'),
                   dict(id='topNavBar'),
                   dict(id='breadcrumbs'),
                   # dict(id='entry-28272'),
                   dict(id='topSearchLinks'),
                   dict(name='span', attrs={'class': 'date'})]

    remove_tags_after = [dict(id='googlemp')]
    auto_cleanup = True

    feeds = [
        (u'securitywatch', u'http://feeds.ziffdavisenterprise.com/RSS/security_watch/')]

    def postprocess_html(self, soup, first_fetch):
        for t in soup.findAll(['table', 'tr', 'td']):
            t.name = 'div'
        return soup
        (u'securitywatch',
         u'http://feeds.pcmag.com/Rss.aspx/SectionArticles?sectionId=28026')
    ]
@@ -17,39 +17,38 @@ class AdvancedUserRecipe1315899507(BasicNewsRecipe):
    auto_cleanup = True
    remove_empty_feeds = True
    publication_type = 'newspaper'
    masthead_url = 'http://media.signonsandiego.com/e2/sosd/images/sosd_logo.png'

    feeds = [
        (u'Latest News', u'http://www.signonsandiego.com/rss/headlines/'),
        (u'Local News', u'http://www.signonsandiego.com/rss/headlines/metro/'),
        (u'Business', u'http://www.signonsandiego.com/rss/headlines/business/'),
        (u'Politics', u'http://www.signonsandiego.com/rss/headlines/local/politics/'),
        (u'Border & Immigration', u'http://www.signonsandiego.com/rss/headlines/border/'),
        (u'Courts', u'http://www.signonsandiego.com/rss/headlines/courts/'),
        (u'Education', u'http://www.signonsandiego.com/news/education/'),
        (u'Sports', u'http://www.signonsandiego.com/rss/headlines/sports/'),
        (u'Chargers', u'http://www.signonsandiego.com/rss/headlines/sports/chargers/'),
        (u'Padres', u'http://www.signonsandiego.com/rss/headlines/sports/padres/'),
        (u'NFL', u'http://www.signonsandiego.com/rss/headlines/sports/nfl/'),
        (u'NBA', u'http://www.signonsandiego.com/rss/headlines/sports/nba/'),
        (u'Nick Canepa', u'http://www.signonsandiego.com/rss/authors/nick-canepa/'),
        (u'Tim Sullivan', u'http://www.signonsandiego.com/rss/authors/tim-sullivan/'),
        (u'Ruben Navarrette', u'http://www.signonsandiego.com/rss/authors/ruben-navarrette/'),
        (u'Diane Bell', u'http://www.signonsandiego.com/rss/authors/diane-bell/'),
        (u'Smart Living', u'http://www.signonsandiego.com/rss/headlines/smart-living/'),
        (u'Photos', u'http://www.signonsandiego.com/rss/photos/'),
        (u'Arts', u'http://www.signonsandiego.com/rss/headlines/night-and-day/theater-arts/'),
        (u'Books', u'http://www.signonsandiego.com/rss/headlines/lifestyle/books/'),
        (u'Currents-Passages',
         u'http://www.signonsandiego.com/rss/headlines/lifestyle/currents/passages/'),
        (u'Currents-Weekend',
         u'http://www.signonsandiego.com/news/rss2/daily/currentsweekend.xml'),
        (u'Dialog', u'http://www.signonsandiego.com/news/rss2/daily/dialog.xml'),
        (u'Home', u'http://www.signonsandiego.com/rss/headlines/home/'),
        (u'Homescape', u'http://www.signonsandiego.com/rss/headlines/lifestyle/homescape/'),
        (u'Night & Day', u'http://www.signonsandiego.com/news/rss2/daily/nightday.xml'),
        (u'Opinion', u'http://www.signonsandiego.com/rss/headlines/opinion/'),
        (u'Quest', u'http://www.signonsandiego.com/news/rss2/daily/quest.xml'),
        (u'Travel', u'http://www.signonsandiego.com/news/rss2/daily/travel.xml'),
        (u'Wheels', u'http://www.signonsandiego.com/news/rss2/daily/wheels.xml')
        (u'Latest News',
         u'http://www.sandiegouniontribune.com/latest/rss2.0.xml'),
        (u'Business',
         u'http://www.sandiegouniontribune.com/business/rss2.0.xml'),
        (u'Politics',
         u'http://www.sandiegouniontribune.com/news/politics/rss2.0.xml'),
        (u'Immigration',
         u'http://www.sandiegouniontribune.com/news/immigration/rss2.0.xml'),
        (u'Courts',
         u'http://www.sandiegouniontribune.com/news/public-safety/rss2.0.xml'),
        (u'Education',
         u'http://www.sandiegouniontribune.com/news/education/rss2.0.xml'),
        (u'Sports',
         u'http://www.sandiegouniontribune.com/sports/rss2.0.xml'),
        (u'Chargers',
         u'http://www.sandiegouniontribune.com/sports/chargers/rss2.0.xml'),
        (u'Padres',
         u'http://www.sandiegouniontribune.com/sports/padres/rss2.0.xml'),
        (u'NFL',
         u'http://www.sandiegouniontribune.com/sports/nfl/rss2.0.xml'),
        (u'NBA',
         u'http://www.sandiegouniontribune.com/sports/nba/rss2.0.xml'),
        (u'Photos',
         u'http://www.sandiegouniontribune.com/visuals/rss2.0.xml'),
        (u'Entertainment',
         u'http://www.sandiegouniontribune.com/entertainment/rss2.0.xml'),
        (u'Books',
         u'http://www.sandiegouniontribune.com/entertainment/books/rss2.0.xml'),
        (u'Opinion',
         u'http://www.sandiegouniontribune.com/opinion/rss2.0.xml'),
        (u'Travel',
         u'http://www.sandiegouniontribune.com/lifestyle/travel/rss2.0.xml'),
    ]
@@ -28,5 +28,5 @@ class Starbulletin(BasicNewsRecipe):
        (u'Business', u'http://www.staradvertiser.com/business/feed/'),
        (u'Sports', u'http://www.staradvertiser.com/sports/feed/'),
        (u'Features',
         u'http://www.staradvertiser.com/featurespremium/index.rss')
         u'http://www.staradvertiser.com/features/feed/')
    ]
@@ -1,97 +0,0 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re


class TelevisionWithoutPity(BasicNewsRecipe):
    title = u'Television Without Pity'
    language = 'en'
    __author__ = 'Snarkastica'
    # Used for pulling down an entire show, not just the RSS feed
    SHOW = 'http://www.televisionwithoutpity.com/show/SHOW-NAME-HERE/recaps/'
    oldest_article = 7  # days
    max_articles_per_feed = 25
    # reverse_article_order=True # Useful for entire show, to display in episode order
    use_embedded_content = False

    preprocess_regexps = [(re.compile(r'<span class="headline_recap_title .*?>',
                          re.DOTALL | re.IGNORECASE), lambda match: '<span class="headline_recap_title">')]
    keep_only_tags = [dict(name='span', attrs={'class': 'headline_recap_title'}), dict(
        name='p', attrs={'class': 'byline'}), dict(name='div', attrs={'class': 'body_recap'}), dict(name='h1')]
    no_stylesheets = True

    # Comment this out and configure process_index() to retrieve a single show
    feeds = [
        ('Ltest Recaps',
         'http://www.televisionwithoutpity.com/rss.xml'),
    ]

    '''
    This method can be used to grab all recaps for a single show
    Set the SHOW constant at the beginning of this file to the URL for a show's recap page
    (the page listing all recaps, usually of the form:
    http://www.televisionwithoutpity.com/show/SHOW-NAME/recaps/"
    Where SHOW-NAME is the hyphenated name of the show.

    To use:
    1. Comment out feeds = [...] earlier in this file
    2. Set the SHOW constant to the show's recap page
    3. Uncomment the following function
    '''

    '''
    def parse_index(self):
        soup = self.index_to_soup(self.SHOW)
        feeds = []
        articles = []
        showTitle = soup.find('h1').string
        recaps = soup.find('table')
        for ep in recaps.findAll('tr'):
            epData = ep.findAll('td')
            epNum = epData[0].find(text=True).strip()
            if not epNum == "Ep.":
                epT = self.tag_to_string(epData[1].find('em')).strip()
                epST = " (or " + self.tag_to_string(epData[1].find('h3')).strip() + ")"
                epTitle = epNum + ": " + epT + epST
                epData[1].find('em').extract()
                epURL = epData[1].find('a', href=True)
                epURL = epURL['href']
                epSum = self.tag_to_string(epData[1].find('p')).strip()
                epDate = epData[2].find(text=True).strip()
                epAuthor = self.tag_to_string(epData[4].find('p')).strip()
                articles.append({'title':epTitle, 'url':epURL, 'description':epSum, 'date':epDate, 'author':epAuthor})
        feeds.append((showTitle, articles))
        #self.abort_recipe_processing("test")
        return feeds
    '''

    # This will add subsequent pages of multipage recaps to a single article
    # page
    def append_page(self, soup, appendtag, position):
        # If false, will still grab single-page recaplets
        if (soup.find('p', attrs={'class': 'pages'})):
            pager = soup.find('p', attrs={'class': 'pages'}).find(text='Next')
            if pager:
                nexturl = pager.parent['href']
                soup2 = self.index_to_soup(nexturl)
                texttag = soup2.find('div', attrs={'class': 'body_recap'})
                for it in texttag.findAll(style=True):
                    del it['style']
                newpos = len(texttag.contents)
                self.append_page(soup2, texttag, newpos)
                texttag.extract()
                appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body, 3)
        return soup

    # Remove the multi page links (we had to keep these in for append_page(), but they can go away now
    # Could have used CSS to hide, but some readers ignore CSS.
    def postprocess_html(self, soup, first_fetch):
        paginator = soup.findAll('p', attrs={'class': 'pages'})
        if paginator:
            for p in paginator:
                p.extract()

        # TODO: Fix this so it converts the headline class into a heading 1
        return soup