Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-07 10:14:46 -04:00

Commit 96947fd01a: Merge branch 'master' of https://github.com/CoderAllan/calibre
@@ -18,35 +18,6 @@ class EdgeConversationRSS(BasicNewsRecipe):
     oldest_article = 60
     max_articles_per_feed = 100
     no_stylesheets = True
+    auto_cleanup = True
 
-    keep_only_tags = [
-        dict(name='div', attrs={'class': 'HomeLeftPannel IMGCTRL'})]
-    remove_tags = [
-        dict(name='div', attrs={'class': 'Logo'})
-    ]
-
-    feeds = [(u'Edge RSS', u'http://edge.org/feeds/')]
+    feeds = [(u'Edge RSS', u'http://edge.org/feed')]
-
-    def print_version(self, url):
-        return url.replace('conversation/', 'conversation.php?cid=')
-
-    def parse_feeds(self):
-
-        # Call parent's method.
-        feeds = BasicNewsRecipe.parse_feeds(self)
-
-        # Loop through all feeds.
-        for feed in feeds:
-
-            # Loop through all articles in feed.
-            for article in feed.articles[:]:
-
-                # Remove anything that is not a conversation, and remove PDF
-                # files as well...
-                if not ('CONVERSATION' in article.title):
-                    feed.articles.remove(article)
-                elif 'pdf' in article.url:
-                    feed.articles.remove(article)
-
-        return feeds
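This first hunk is representative of the whole commit: hand-tuned keep_only_tags/remove_tags lists and a parse_feeds override give way to calibre's auto_cleanup, and the feed URL is refreshed. For orientation, a minimal recipe that relies on auto_cleanup looks roughly like the sketch below; the class name and feed URL are illustrative placeholders, not taken from this commit.

from calibre.web.feeds.news import BasicNewsRecipe


class ExampleAutoCleanupRecipe(BasicNewsRecipe):
    # Sketch only: title and feed URL are placeholders.
    title = u'Example Feed'
    oldest_article = 60           # skip items older than 60 days
    max_articles_per_feed = 100
    no_stylesheets = True
    auto_cleanup = True           # heuristic article extraction instead of per-site tag lists

    feeds = [(u'Example RSS', u'http://example.com/feed')]

With auto_cleanup enabled, BasicNewsRecipe extracts the main article body heuristically, which is why per-site keep_only_tags/remove_tags and custom print_version/parse_feeds logic can usually be dropped.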
@@ -1,53 +0,0 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class EuropeanVoice(BasicNewsRecipe):
-    title = u'European Voice'
-    __author__ = 'malfi'
-    oldest_article = 14
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    cover_url = 'http://www.europeanvoice.com/Css/images/logo.gif'
-    language = 'en'
-    keep_only_tags = [dict(name='div', attrs={'id': 'articleLeftColumn'})]
-    remove_tags = [dict(name='div', attrs={'id': 'BreadCrump'})]
-    feeds = [
-        (u'Whole site ', u'http://www.europeanvoice.com/Rss/2.xml'),
-        (u'News and analysis', u'http://www.europeanvoice.com/Rss/6.xml'),
-        (u'Comment', u'http://www.europeanvoice.com/Rss/7.xml'),
-        (u'Special reports', u'http://www.europeanvoice.com/Rss/5.xml'),
-        (u'People', u'http://www.europeanvoice.com/Rss/8.xml'),
-        (u'Career', u'http://www.europeanvoice.com/Rss/11.xml'),
-        (u'Policies', u'http://www.europeanvoice.com/Rss/4.xml'),
-        (u'EVents', u'http://www.europeanvoice.com/Rss/10.xml'),
-        (u'Policies - Economics', u'http://www.europeanvoice.com/Rss/31.xml'),
-        (u'Policies - Business', u'http://www.europeanvoice.com/Rss/19.xml'),
-        (u'Policies - Trade', u'http://www.europeanvoice.com/Rss/25.xml'),
-        (u'Policies - Information society',
-         u'http://www.europeanvoice.com/Rss/20.xml'),
-        (u'Policies - Energy', u'http://www.europeanvoice.com/Rss/15.xml'),
-        (u'Policies - Transport', u'http://www.europeanvoice.com/Rss/18.xml'),
-        (u'Policies - Climate change', u'http://www.europeanvoice.com/Rss/16.xml'),
-        (u'Policies - Environment', u'http://www.europeanvoice.com/Rss/17.xml'),
-        (u'Policies - Farming & food', u'http://www.europeanvoice.com/Rss/23.xml'),
-        (u'Policies - Health & society', u'http://www.europeanvoice.com/Rss/24.xml'),
-        (u'Policies - Justice', u'http://www.europeanvoice.com/Rss/29.xml'),
-        (u'Policies - Foreign affairs', u'http://www.europeanvoice.com/Rss/27.xml')
-    ]
-    extra_css = '''
-        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
-        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
-        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
-        '''
-
-    def print_version(self, url):
-        return url + '?bPrint=1'
-
-    def preprocess_html(self, soup):
-        denied = soup.findAll(True, text='Subscribers')
-        if denied:
-            raise Exception(
-                'Article skipped, because content can only be seen with subscription')
-        return soup
@@ -15,29 +15,17 @@ class Kitsapsun(BasicNewsRecipe):
     publisher = 'Scripps Interactive Newspapers Group'
     category = 'news, Kitsap county, USA'
     language = 'en'
-    oldest_article = 2
-    max_articles_per_feed = 100
+    oldest_article = 7
+    max_articles_per_feed = 50
     no_stylesheets = True
     encoding = 'cp1252'
     use_embedded_content = False
+    auto_cleanup = True
 
-    conversion_options = {
-        'comments': description, 'tags': category, 'language': language, 'publisher': publisher
-    }
-
-    keep_only_tags = [
-        dict(name='div', attrs={'id': ['story_meta', 'story_content']})]
-
-    remove_tags = [dict(name=['object', 'link', 'embed', 'form', 'iframe'])]
-
-    feeds = [
-        (u'News', u'http://www.kitsapsun.com/rss/headlines/news/'),
-        (u'Business', u'http://www.kitsapsun.com/rss/headlines/business/'),
-        (u'Communities', u'http://www.kitsapsun.com/rss/headlines/communities/'),
-        (u'Entertainment', u'http://www.kitsapsun.com/rss/headlines/entertainment/'),
-        (u'Lifestyles', u'http://www.kitsapsun.com/rss/headlines/lifestyles/')
-    ]
-
-    def print_version(self, url):
-        return url.rpartition('/')[0] + '/?print=1'
+    feeds = [(u'News', u'http://www.kitsapsun.com/feeds/rss/news'),
+             (u'Sports', u'http://www.kitsapsun.com/feeds/rss/sports'),
+             (u'Entertainment',
+              u'http://www.kitsapsun.com/feeds/rss/entertainment'),
+             (u'Lifestyles', u'http://www.kitsapsun.com/feeds/rss/lifestyle'),
+             (u'Opinion', u'http://www.kitsapsun.com/feeds/rss/opinion'),
+             ]
@@ -14,5 +14,5 @@ class HindustanTimes(BasicNewsRecipe):
 
     feeds = [
         ('News',
-         'http://www.mobilenations.com/rss/mb.xml'),
+         'http://www.mobilenations.com/about?format=RSS'),
     ]
@@ -1,8 +1,4 @@
-import re
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre import browser
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre.ebooks.BeautifulSoup import Tag
 
 
 class AdvancedUserRecipe1306061239(BasicNewsRecipe):
@@ -31,383 +27,10 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     simultaneous_downloads = 20
     use_embedded_content = False
     recursions = 0
+    auto_cleanup = True
-
-    conversion_options = {
-        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
-    }
 
     feeds = [
-        (u'NME News', u'http://www.nme.com/rss/news'),
-        (u'Reviews', u'http://www.nme.com/rss/reviews'),
-        (u'Blogs', u'http://www.nme.com/rss/blogs'),
+        (u'NME News', u'http://www.nme.com/news/feed'),
+        (u'Reviews', u'http://www.nme.com/reviews/feed/'),
+        (u'Blogs', u'http://www.nme.com/blogs/feed'),
     ]
-
-    keep_only_tags = [
-        dict(name='div', attrs={'id': 'content'}),
-    ]
-
-    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
-                         'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
-
-    remove_tags = [
-        dict(name='meta'),
-        dict(name='span', attrs={'class': 'article_info'}),
-        dict(name='div', attrs={'class': 'breadcrumbs'}),
-        dict(name='div', attrs={'class': 'mugshot'}),
-        dict(name='div', attrs={'class': 'header'}),
-        dict(name='div', attrs={'class': re.compile('youtube.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile('socialbuttons.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': 'clear_both'}),
-        dict(name='div', attrs={'class': re.compile('headline.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': 'member-signedout'}),
-        dict(name='div', attrs={'class': re.compile('prev_next.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile('article_related.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile('feature_bar.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile('ebay.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile('morenews.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile('ticketspopup.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile('ratemy_logprompt.*', re.IGNORECASE)}),
-        dict(name='div', attrs={'id': re.compile('related_artist.*', re.IGNORECASE)}),
-        dict(name='img', attrs={'class': re.compile('video_play_large.*', re.IGNORECASE)}),
-        dict(name='ul', attrs={'class': re.compile('prev_next.*', re.IGNORECASE)}),
-        dict(name='ul', attrs={'class': re.compile('nme_store.*', re.IGNORECASE)}),
-        dict(name='p', attrs={'class': re.compile('top', re.IGNORECASE)}),
-        dict(name='table', attrs={'class': re.compile('tickets.*', re.IGNORECASE)}),
-    ]
-
-    masthead_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
-
-    def get_cover_url(self):
-        magazine_page_raw = self.index_to_soup('http://www.nme.com/magazine', raw=True)
-        magazine_page_raw = re.sub(r'<script\b.+?</script>', '', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
-        magazine_page_raw = re.sub(r'\!\[if ', '!--[if ', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
-        magazine_page = self.index_to_soup(magazine_page_raw)
-        cov = magazine_page.find('img', attrs={'class': 'magcover'})
-        cov2 = str(cov['src'])
-        br = browser()
-        br.set_handle_redirect(False)
-        try:
-            br.open_novisit(cov2)
-            cover_url = str(cov2)
-        except:
-            cover_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
-        return cover_url
-
-    def preprocess_raw_html(self, raw_html, url):
-        '''
-        Need this for a bug on site that prevents blogg post being parsed correctly
-        '''
-        raw_html = re.sub(r'\!\[if ', '!--[if ', raw_html, flags=re.DOTALL | re.IGNORECASE)
-        return raw_html
-
-    def preprocess_html(self, soup):
-        youtube_regex = re.compile(r'(?:youtube\.com/(?:[^/]+/.+/|(?:v|e(?:mbed)?)/|.*[?&]v=)|youtu\.be/)(?P<id>[^"&?/ ]{11})', re.DOTALL | re.IGNORECASE)
-        instagram_regex = re.compile(r'.*?instagram.*?', re.DOTALL | re.IGNORECASE)
-        twitter_regex = re.compile(r'.*?twitter.*?', re.DOTALL | re.IGNORECASE)
-        visualise_regex = re.compile(r'.*?visualise.*?', re.DOTALL | re.IGNORECASE)
-        soundcloud_regex = re.compile(r'(?P<url>.*?(w|api)\.soundcloud.*?com\/(tracks|playlists)\/\d{8,9})', re.DOTALL | re.IGNORECASE)
-        dailymotion_regex = re.compile(r'.*?dailymotion.*?', re.DOTALL | re.IGNORECASE)
-        spotify_regex = re.compile(r'.*?spotify.*?', re.DOTALL | re.IGNORECASE)
-        vine_regex = re.compile(r'.*?vine.*?', re.DOTALL | re.IGNORECASE)
-        doubleHtmlEntities = re.compile(ur'(&)(?P<e>[\d\w\#]*;)', re.DOTALL | re.IGNORECASE | re.UNICODE)
-        for iframe in soup.findAll('iframe'):
-            if iframe.has_key('src') and youtube_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ YouTube ]')
-                pq.insert(1, br)
-                m = youtube_regex.search(iframe['src'])
-                if m.group('id') is not None:
-                    imgTag = Tag(soup, 'img', [('src', 'http://img.youtube.com/vi/' + m.group('id') + '/0.jpg')])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and soundcloud_regex.search(iframe['src']) is not None:  # noqa
-                m = soundcloud_regex.search(iframe['src'])
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ SoundCloud ]')
-                pq.insert(1, br)
-                pq.insert(2, m.group('url'))
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and dailymotion_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ DailyMotion ]')
-                pq.insert(1, br)
-                imgUrl = self.get_dailymotion_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and spotify_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Spotify ]')
-                pq.insert(1, br)
-                imgUrl = self.get_spotify_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and vine_regex.search(iframe['src']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Vine ]')
-                pq.insert(1, br)
-                imgUrl = self.get_vine_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), imgTag)
-                pq.insert(len(pq.contents), iframe['src'])
-                iframe.replaceWith(pq)
-            elif iframe.has_key('src') and visualise_regex.search(iframe['src']) is not None:  # noqa
-                imgUrl = self.get_visualise_pic(iframe['src'])
-                if imgUrl is not None:
-                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
-                    iframe.replaceWith(imgTag)
-        for blockquote in soup.findAll('blockquote'):
-            if blockquote.has_key('class') and twitter_regex.search(blockquote['class']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Twitter ]')
-                pq.insert(len(pq.contents), br)
-                match = re.search("(?P<url>pic\.twitter[^\s<]+)", str(blockquote))
-                if match is not None:
-                    img = self.get_twitter_pic(str(match.group("url")))
-                    if img is not None:
-                        pq.insert(len(pq.contents), img)
-                for p in blockquote.findAll(name='p'):
-                    x = 0
-                    plen = len(p.contents)
-                    while True:
-                        c = len(pq.contents)
-                        if p.contents[x].string is not None:
-                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(p.contents[x].string), re.IGNORECASE | re.UNICODE))
-                        else:
-                            pq.insert(c, p.contents[x].content)
-                        x += 1
-                        if x == plen:
-                            break
-                    br = Tag(soup, 'br')
-                    pq.insert(len(pq.contents), br)
-                    p.extract()
-                if len(blockquote.contents) > 0:
-                    x = 0
-                    xlen = len(blockquote.contents)
-                    while True:
-                        c = len(pq.contents)
-                        if blockquote.contents[x].string is not None:
-                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(blockquote.contents[x].string), re.IGNORECASE | re.UNICODE))
-                        else:
-                            pq.insert(c, blockquote.contents[x].content)
-                        x += 1
-                        if x == xlen:
-                            break
-                blockquote.replaceWith(pq)
-            elif blockquote.has_key('class') and instagram_regex.search(blockquote['class']) is not None:  # noqa
-                pq = Tag(soup, 'blockquote')
-                br = Tag(soup, 'br')
-                pq.insert(0, '[ Instagram ]')
-                pq.insert(1, br)
-                a = blockquote.find(name='a', attrs={'href': instagram_regex})
-                imgUrl = None
-                if a is not None:
-                    imgUrl = self.get_instagram_pic(str(a['href']))
-                if imgUrl is not None:
-                    img = Tag(soup, 'img', [('src', imgUrl)])
-                    pq.insert(len(pq.contents), img)
-                for p in blockquote.findAll(name='p'):
-                    x = 0
-                    plen = len(p.contents)
-                    while x < plen:
-                        c = len(pq.contents)
-                        if p.contents[x].string is not None:
-                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(p.contents[x].string), re.IGNORECASE | re.UNICODE))
-                        # else:
-                        #     pq.insert(c, p.contents[x].content)
-                        x += 1
-                    br = Tag(soup, 'br')
-                    c = len(pq.contents)
-                    pq.insert(c, br)
-                blockquote.replaceWith(pq)
-        for alink in soup.findAll('a'):
-            if alink.string is not None:
-                tstr = alink.string
-                alink.replaceWith(tstr)
-            elif alink.img is not None:
-                tstr = alink.img
-                alink.replaceWith(tstr)
-            elif alink.span is not None:
-                tstr = alink.span
-                alink.replaceWith(tstr)
-        return soup
-
-    def get_visualise_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        bs = BeautifulSoup(raw)
-        imgRaw = bs.find(name='meta', attrs={'property': 'og:image'})
-        if imgRaw is not None:
-            returnValue = str(imgRaw['content'])
-        return returnValue
-
-    def get_twitter_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open('https://' + url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        bs = BeautifulSoup(raw)
-        refresh = bs.find('meta', {'http-equiv': 'refresh'})
-        if refresh is not None:
-            content = refresh.get('content').partition('=')[2]
-            try:
-                raw = self.browser.open(content).read()
-            except:
-                print '404: ' + url
-                return returnValue
-            bs = BeautifulSoup(raw)
-        img = bs.find(name='img', attrs={'alt': re.compile('.*permalink.*', re.IGNORECASE)})
-        if img is not None:
-            returnValue = img
-        return returnValue
-
-    def get_soundcloud_pic(self, url):
-        # content loaded via javascript and require an login and/or registered application identification
-        # returnValue = None
-        # raw = self.browser.open(soundcloudUrl + '&visual=true').read()
-        # bs = BeautifulSoup(raw)
-        # imgRaw = bs.find(name='div', attrs={'style':re.compile(r'backgroud-image:*?',re.IGNORECASE)})
-        # if imgRaw is not None:
-        #     returnValue = str(imgRaw['style'])
-        return None  # returnValue
-
-    def get_instagram_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('\"display_src\":\"(?P<url>http[s]?:.*?)\"', str(raw))
-        if m is not None:
-            returnValue = re.sub(r'\\', '', m.group("url"), flags=re.DOTALL | re.IGNORECASE)
-        return returnValue
-
-    def get_dailymotion_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('("thumbnail_url\"\:\")(?P<url>http.*?)(\")', str(raw))
-        if m is not None:
-            returnValue = re.sub(r'\\', '', m.group("url"), flags=re.DOTALL | re.IGNORECASE)
-        return returnValue
-
-    def get_spotify_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('data-ca=\"(?P<url>.*?)\"', str(raw))
-        if m is not None:
-            returnValue = m.group("url")
-        return returnValue
-
-    def get_vine_pic(self, url):
-        returnValue = None
-        try:
-            raw = self.browser.open(url).read()
-        except:
-            print '404: ' + url
-            return returnValue
-        m = re.search('"thumbnail.*?src=\"(?P<url>.*?)\"', str(raw))
-        if m is not None:
-            returnValue = m.group("url")
-        return returnValue
-
-    preprocess_regexps = [
-        (re.compile(r'<script\b.+?</script>', re.DOTALL | re.IGNORECASE), lambda h1: ''),
-        (re.compile(r'<a.* id="buy-tickets-button".*</a>', re.IGNORECASE), lambda h2: ''),
-        (re.compile(r'<a.* class="gallery.*</a>', re.IGNORECASE), lambda h2: ''),
-    ]
-
-    extra_css = '''
-        h1 h2 {
-            font-family:Arial,Helvetica,sans-serif;
-            font-weight:bold;font-size:large;
-        }
-        h3 {
-            font-family:Arial,Helvetica,sans-serif;
-            font-weight:normal;
-            font-size:small;
-            font-style:italic;
-            display:inline;
-        }
-        body {
-            font-family:Helvetica,Arial,sans-serif;
-            font-size:small;
-        }
-        blockquote {
-            font-family:"Courier New", Courier, monospace;
-            font-size:90%;
-        }
-        img {
-            display:block;
-        }
-        .date{
-            font-style:italic;
-            font-weight:normal;
-        }
-        .article_header>p:not(.date){
-            font-weight:bold;
-        }
-        '''
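The preprocess_html deleted above replaced embedded players (YouTube, SoundCloud, DailyMotion, Spotify, Vine) with plain blockquote placeholders, built with the legacy BeautifulSoup 3 Tag constructor. As a reference point only, here is a minimal sketch of the same idea for the YouTube case written against the bs4 API; the helper name and sample HTML are illustrative and not part of this commit.

import re

from bs4 import BeautifulSoup

YOUTUBE_ID = re.compile(
    r'(?:youtube\.com/(?:embed/|.*[?&]v=)|youtu\.be/)(?P<id>[^"&?/ ]{11})', re.IGNORECASE)


def replace_youtube_iframes(soup):
    # Swap each embedded YouTube player for a static placeholder: a blockquote
    # holding the public thumbnail image plus the original iframe URL.
    for iframe in soup.find_all('iframe', src=True):
        m = YOUTUBE_ID.search(iframe['src'])
        if m is None:
            continue
        quote = soup.new_tag('blockquote')
        quote.append('[ YouTube ] ')
        quote.append(soup.new_tag('img', src='https://img.youtube.com/vi/%s/0.jpg' % m.group('id')))
        quote.append(' ' + iframe['src'])
        iframe.replace_with(quote)
    return soup


if __name__ == '__main__':
    html = '<p><iframe src="https://www.youtube.com/embed/dQw4w9WgXcQ"></iframe></p>'
    print(replace_youtube_iframes(BeautifulSoup(html, 'html.parser')))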
@@ -1,22 +0,0 @@
-__license__ = 'GPL v3'
-__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class TheResurgence(BasicNewsRecipe):
-    title = u'The Resurgence'
-    __author__ = 'Peter Grungi'
-    language = 'en'
-
-    oldest_article = 7
-    max_articles_per_feed = 10
-    auto_cleanup = True
-    cover_url = 'http://cdn.theresurgence.com/images/logo.png'
-    masthead_url = 'http://cdn.theresurgence.com/images/logo.png'
-    language = 'en'
-    publisher = 'The Resurgence'
-    author = 'The Resurgence'
-
-    feeds = [
-        (u'The Resurgence', u'http://feeds.theresurgence.com/TheResurgence?format=xml')]
@@ -10,30 +10,10 @@ class SecurityWatch(BasicNewsRecipe):
     oldest_article = 14
     max_articles_per_feed = 100
     use_embedded_content = False
-    filter_regexps = [r'feedads\.googleadservices\.com']
-    filter_regexps = [r'ad\.doubleclick']
-    filter_regexps = [r'advert']
     language = 'en'
+    auto_cleanup = True
-    extra_css = 'div {text-align:left}'
-
-    remove_tags = [dict(id='topBannerContainer'),
-                   dict(id='topBannerSmall'),
-                   dict(id='topSearchBar'),
-                   dict(id='topSearchForm'),
-                   dict(id='rtBannerMPU'),
-                   dict(id='topNavBar'),
-                   dict(id='breadcrumbs'),
-                   # dict(id='entry-28272'),
-                   dict(id='topSearchLinks'),
-                   dict(name='span', attrs={'class': 'date'})]
-
-    remove_tags_after = [dict(id='googlemp')]
 
     feeds = [
-        (u'securitywatch', u'http://feeds.ziffdavisenterprise.com/RSS/security_watch/')]
+        (u'securitywatch',
+         u'http://feeds.pcmag.com/Rss.aspx/SectionArticles?sectionId=28026')
+    ]
-
-    def postprocess_html(self, soup, first_fetch):
-        for t in soup.findAll(['table', 'tr', 'td']):
-            t.name = 'div'
-        return soup
@@ -17,39 +17,38 @@ class AdvancedUserRecipe1315899507(BasicNewsRecipe):
     auto_cleanup = True
     remove_empty_feeds = True
     publication_type = 'newspaper'
-    masthead_url = 'http://media.signonsandiego.com/e2/sosd/images/sosd_logo.png'
 
     feeds = [
-        (u'Latest News', u'http://www.signonsandiego.com/rss/headlines/'),
-        (u'Local News', u'http://www.signonsandiego.com/rss/headlines/metro/'),
-        (u'Business', u'http://www.signonsandiego.com/rss/headlines/business/'),
-        (u'Politics', u'http://www.signonsandiego.com/rss/headlines/local/politics/'),
-        (u'Border & Immigration', u'http://www.signonsandiego.com/rss/headlines/border/'),
-        (u'Courts', u'http://www.signonsandiego.com/rss/headlines/courts/'),
-        (u'Education', u'http://www.signonsandiego.com/news/education/'),
-        (u'Sports', u'http://www.signonsandiego.com/rss/headlines/sports/'),
-        (u'Chargers', u'http://www.signonsandiego.com/rss/headlines/sports/chargers/'),
-        (u'Padres', u'http://www.signonsandiego.com/rss/headlines/sports/padres/'),
-        (u'NFL', u'http://www.signonsandiego.com/rss/headlines/sports/nfl/'),
-        (u'NBA', u'http://www.signonsandiego.com/rss/headlines/sports/nba/'),
-        (u'Nick Canepa', u'http://www.signonsandiego.com/rss/authors/nick-canepa/'),
-        (u'Tim Sullivan', u'http://www.signonsandiego.com/rss/authors/tim-sullivan/'),
-        (u'Ruben Navarrette', u'http://www.signonsandiego.com/rss/authors/ruben-navarrette/'),
-        (u'Diane Bell', u'http://www.signonsandiego.com/rss/authors/diane-bell/'),
-        (u'Smart Living', u'http://www.signonsandiego.com/rss/headlines/smart-living/'),
-        (u'Photos', u'http://www.signonsandiego.com/rss/photos/'),
-        (u'Arts', u'http://www.signonsandiego.com/rss/headlines/night-and-day/theater-arts/'),
-        (u'Books', u'http://www.signonsandiego.com/rss/headlines/lifestyle/books/'),
-        (u'Currents-Passages',
-         u'http://www.signonsandiego.com/rss/headlines/lifestyle/currents/passages/'),
-        (u'Currents-Weekend',
-         u'http://www.signonsandiego.com/news/rss2/daily/currentsweekend.xml'),
-        (u'Dialog', u'http://www.signonsandiego.com/news/rss2/daily/dialog.xml'),
-        (u'Home', u'http://www.signonsandiego.com/rss/headlines/home/'),
-        (u'Homescape', u'http://www.signonsandiego.com/rss/headlines/lifestyle/homescape/'),
-        (u'Night & Day', u'http://www.signonsandiego.com/news/rss2/daily/nightday.xml'),
-        (u'Opinion', u'http://www.signonsandiego.com/rss/headlines/opinion/'),
-        (u'Quest', u'http://www.signonsandiego.com/news/rss2/daily/quest.xml'),
-        (u'Travel', u'http://www.signonsandiego.com/news/rss2/daily/travel.xml'),
-        (u'Wheels', u'http://www.signonsandiego.com/news/rss2/daily/wheels.xml')
+        (u'Latest News',
+         u'http://www.sandiegouniontribune.com/latest/rss2.0.xml'),
+        (u'Business',
+         u'http://www.sandiegouniontribune.com/business/rss2.0.xml'),
+        (u'Politics',
+         u'http://www.sandiegouniontribune.com/news/politics/rss2.0.xml'),
+        (u'Immigration',
+         u'http://www.sandiegouniontribune.com/news/immigration/rss2.0.xml'),
+        (u'Courts',
+         u'http://www.sandiegouniontribune.com/news/public-safety/rss2.0.xml'),
+        (u'Education',
+         u'http://www.sandiegouniontribune.com/news/education/rss2.0.xml'),
+        (u'Sports',
+         u'http://www.sandiegouniontribune.com/sports/rss2.0.xml'),
+        (u'Chargers',
+         u'http://www.sandiegouniontribune.com/sports/chargers/rss2.0.xml'),
+        (u'Padres',
+         u'http://www.sandiegouniontribune.com/sports/padres/rss2.0.xml'),
+        (u'NFL',
+         u'http://www.sandiegouniontribune.com/sports/nfl/rss2.0.xml'),
+        (u'NBA',
+         u'http://www.sandiegouniontribune.com/sports/nba/rss2.0.xml'),
+        (u'Photos',
+         u'http://www.sandiegouniontribune.com/visuals/rss2.0.xml'),
+        (u'Entertainment',
+         u'http://www.sandiegouniontribune.com/entertainment/rss2.0.xml'),
+        (u'Books',
+         u'http://www.sandiegouniontribune.com/entertainment/books/rss2.0.xml'),
+        (u'Opinion',
+         u'http://www.sandiegouniontribune.com/opinion/rss2.0.xml'),
+        (u'Travel',
+         u'http://www.sandiegouniontribune.com/lifestyle/travel/rss2.0.xml'),
     ]
@@ -28,5 +28,5 @@ class Starbulletin(BasicNewsRecipe):
         (u'Business', u'http://www.staradvertiser.com/business/feed/'),
         (u'Sports', u'http://www.staradvertiser.com/sports/feed/'),
         (u'Features',
-         u'http://www.staradvertiser.com/featurespremium/index.rss')
+         u'http://www.staradvertiser.com/features/feed/')
     ]
@@ -1,97 +0,0 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-import re
-
-
-class TelevisionWithoutPity(BasicNewsRecipe):
-    title = u'Television Without Pity'
-    language = 'en'
-    __author__ = 'Snarkastica'
-    # Used for pulling down an entire show, not just the RSS feed
-    SHOW = 'http://www.televisionwithoutpity.com/show/SHOW-NAME-HERE/recaps/'
-    oldest_article = 7  # days
-    max_articles_per_feed = 25
-    # reverse_article_order=True # Useful for entire show, to display in episode order
-    use_embedded_content = False
-
-    preprocess_regexps = [(re.compile(r'<span class="headline_recap_title .*?>',
-                                      re.DOTALL | re.IGNORECASE), lambda match: '<span class="headline_recap_title">')]
-    keep_only_tags = [dict(name='span', attrs={'class': 'headline_recap_title'}), dict(
-        name='p', attrs={'class': 'byline'}), dict(name='div', attrs={'class': 'body_recap'}), dict(name='h1')]
-    no_stylesheets = True
-
-    # Comment this out and configure process_index() to retrieve a single show
-    feeds = [
-        ('Ltest Recaps',
-         'http://www.televisionwithoutpity.com/rss.xml'),
-    ]
-
-    '''
-    This method can be used to grab all recaps for a single show
-    Set the SHOW constant at the beginning of this file to the URL for a show's recap page
-    (the page listing all recaps, usually of the form:
-    http://www.televisionwithoutpity.com/show/SHOW-NAME/recaps/"
-    Where SHOW-NAME is the hyphenated name of the show.
-
-    To use:
-    1. Comment out feeds = [...] earlier in this file
-    2. Set the SHOW constant to the show's recap page
-    3. Uncomment the following function
-    '''
-
-    '''
-    def parse_index(self):
-        soup = self.index_to_soup(self.SHOW)
-        feeds = []
-        articles = []
-        showTitle = soup.find('h1').string
-        recaps = soup.find('table')
-        for ep in recaps.findAll('tr'):
-            epData = ep.findAll('td')
-            epNum = epData[0].find(text=True).strip()
-            if not epNum == "Ep.":
-                epT = self.tag_to_string(epData[1].find('em')).strip()
-                epST = " (or " + self.tag_to_string(epData[1].find('h3')).strip() + ")"
-                epTitle = epNum + ": " + epT + epST
-                epData[1].find('em').extract()
-                epURL = epData[1].find('a', href=True)
-                epURL = epURL['href']
-                epSum = self.tag_to_string(epData[1].find('p')).strip()
-                epDate = epData[2].find(text=True).strip()
-                epAuthor = self.tag_to_string(epData[4].find('p')).strip()
-                articles.append({'title':epTitle, 'url':epURL, 'description':epSum, 'date':epDate, 'author':epAuthor})
-        feeds.append((showTitle, articles))
-        #self.abort_recipe_processing("test")
-        return feeds
-    '''
-
-    # This will add subsequent pages of multipage recaps to a single article
-    # page
-    def append_page(self, soup, appendtag, position):
-        # If false, will still grab single-page recaplets
-        if (soup.find('p', attrs={'class': 'pages'})):
-            pager = soup.find('p', attrs={'class': 'pages'}).find(text='Next')
-            if pager:
-                nexturl = pager.parent['href']
-                soup2 = self.index_to_soup(nexturl)
-                texttag = soup2.find('div', attrs={'class': 'body_recap'})
-                for it in texttag.findAll(style=True):
-                    del it['style']
-                newpos = len(texttag.contents)
-                self.append_page(soup2, texttag, newpos)
-                texttag.extract()
-                appendtag.insert(position, texttag)
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body, 3)
-        return soup
-
-    # Remove the multi page links (we had to keep these in for append_page(), but they can go away now
-    # Could have used CSS to hide, but some readers ignore CSS.
-    def postprocess_html(self, soup, first_fetch):
-        paginator = soup.findAll('p', attrs={'class': 'pages'})
-        if paginator:
-            for p in paginator:
-                p.extract()
-
-        # TODO: Fix this so it converts the headline class into a heading 1
-        return soup
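The deleted recipe above ends with a TODO about promoting the recap headline span to a proper heading. For what it is worth, a small bs4 sketch of that kind of transformation follows; the class name comes from the recipe above, while the helper name and sample HTML are illustrative only.

from bs4 import BeautifulSoup


def promote_headline(soup):
    # Turn each span carrying the recap headline class into an <h1> so e-book
    # readers render it as a heading rather than inline text.
    for span in soup.find_all('span', class_='headline_recap_title'):
        span.name = 'h1'
    return soup


if __name__ == '__main__':
    html = '<span class="headline_recap_title">Pilot</span><p>Recap body</p>'
    print(promote_headline(BeautifulSoup(html, 'html.parser')))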