import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import Tag


class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    title = u'New Musical Express Magazine'
    description = 'UK Rock & Pop Mag.'
    __author__ = 'Dave Asbury, Inge Aning'
    category = 'Music, Film, TV'
    publisher = 'Time Inc. (UK) Ltd.'
    '''
    ' updated 11/3/2015
    ' feeds url
    ' cover and masthead url
    ' fix for a bug that prevented some pages from rendering
    ' changes to website
    '''

    remove_empty_feeds = True
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 20
    auto_cleanup = False
    language = 'en'
    compress_news_images = True
    simultaneous_downloads = 20
    use_embedded_content = False
    recursions = 0

    conversion_options = {
        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
    }

    feeds = [
        (u'NME News', u'http://www.nme.com/rss/news'),
        (u'Reviews', u'http://www.nme.com/rss/reviews'),
        (u'Blogs', u'http://www.nme.com/rss/blogs'),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'id': 'content'}),
    ]

    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
                         'valign', 'vspace', 'hspace', 'alt', 'width', 'height']

    remove_tags = [
        dict(name='meta'),
        dict(name='span', attrs={'class': 'article_info'}),
        dict(name='div', attrs={'class': 'breadcrumbs'}),
        dict(name='div', attrs={'class': 'mugshot'}),
        dict(name='div', attrs={'class': 'header'}),
        dict(name='div', attrs={'class': re.compile('youtube.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile('socialbuttons.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': 'clear_both'}),
        dict(name='div', attrs={'class': re.compile('headline.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': 'member-signedout'}),
        dict(name='div', attrs={'class': re.compile('prev_next.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile('article_related.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile('feature_bar.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile('ebay.*', re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile('morenews.*', re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile('ticketspopup.*', re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile('ratemy_logprompt.*', re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile('related_artist.*', re.IGNORECASE)}),
        dict(name='img', attrs={'class': re.compile('video_play_large.*', re.IGNORECASE)}),
        dict(name='ul', attrs={'class': re.compile('prev_next.*', re.IGNORECASE)}),
        dict(name='ul', attrs={'class': re.compile('nme_store.*', re.IGNORECASE)}),
        dict(name='p', attrs={'class': re.compile('top', re.IGNORECASE)}),
        dict(name='table', attrs={'class': re.compile('tickets.*', re.IGNORECASE)}),
    ]

    masthead_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'

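    # Fetch the magazine index page, strip scripts and the malformed conditional
    # comment, then read the cover image URL from the 'magcover' img; fall back
    # to the NME logo used as masthead_url if the cover cannot be opened.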
    def get_cover_url(self):
        magazine_page_raw = self.index_to_soup(
            'http://www.nme.com/magazine', raw=True)
        magazine_page_raw = re.sub(
            r'<script\b.+?</script>', '', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
        magazine_page_raw = re.sub(
            r'\!\[if ', '!--[if ', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
        magazine_page = self.index_to_soup(magazine_page_raw)
        cov = magazine_page.find('img', attrs={'class': 'magcover'})

        cov2 = str(cov['src'])

        br = browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cov2)
            cover_url = str(cov2)
        except:
            cover_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
        return cover_url

    def preprocess_raw_html(self, raw_html, url):
        '''
        Needed for a bug on the site that prevents blog posts from being parsed correctly
        '''
        raw_html = re.sub(r'\!\[if ', '!--[if ', raw_html,
                          flags=re.DOTALL | re.IGNORECASE)

        return raw_html

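    # Replace embedded media that cannot render in the e-book (YouTube, SoundCloud,
    # DailyMotion, Spotify, Vine and visualise iframes, plus Twitter and Instagram
    # blockquotes) with a labelled blockquote holding a thumbnail (where one can be
    # found) and the source URL, then unwrap <a> tags into their text or child element.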
    def preprocess_html(self, soup):
        youtube_regex = re.compile(
            r'(?:youtube\.com/(?:[^/]+/.+/|(?:v|e(?:mbed)?)/|.*[?&]v=)|youtu\.be/)(?P<id>[^"&?/ ]{11})', re.DOTALL | re.IGNORECASE)
        instagram_regex = re.compile(
            r'.*?instagram.*?', re.DOTALL | re.IGNORECASE)
        twitter_regex = re.compile(r'.*?twitter.*?', re.DOTALL | re.IGNORECASE)
        visualise_regex = re.compile(
            r'.*?visualise.*?', re.DOTALL | re.IGNORECASE)
        soundcloud_regex = re.compile(
            r'(?P<url>.*?(w|api)\.soundcloud.*?com\/(tracks|playlists)\/\d{8,9})', re.DOTALL | re.IGNORECASE)
        dailymotion_regex = re.compile(
            r'.*?dailymotion.*?', re.DOTALL | re.IGNORECASE)
        spotify_regex = re.compile(r'.*?spotify.*?', re.DOTALL | re.IGNORECASE)
        vine_regex = re.compile(r'.*?vine.*?', re.DOTALL | re.IGNORECASE)
        # matches double-encoded entities such as '&amp;nbsp;' so they can be
        # collapsed back to a single entity
        doubleHtmlEntities = re.compile(
            ur'(&amp;)(?P<e>[\d\w\#]*;)', re.DOTALL | re.IGNORECASE | re.UNICODE)

        for iframe in soup.findAll('iframe'):
            if iframe.has_key('src') and youtube_regex.search(iframe['src']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ YouTube ]')
                pq.insert(1, br)
                m = youtube_regex.search(iframe['src'])
                if m.group('id') is not None:
                    imgTag = Tag(soup, 'img', [
                        ('src', 'http://img.youtube.com/vi/' + m.group('id') + '/0.jpg')])
                    pq.insert(len(pq.contents), imgTag)
                pq.insert(len(pq.contents), iframe['src'])
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and soundcloud_regex.search(iframe['src']) is not None:  # noqa
                m = soundcloud_regex.search(iframe['src'])
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ SoundCloud ]')
                pq.insert(1, br)
                pq.insert(2, m.group('url'))
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and dailymotion_regex.search(iframe['src']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ DailyMotion ]')
                pq.insert(1, br)
                imgUrl = self.get_dailymotion_pic(iframe['src'])
                if imgUrl is not None:
                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
                    pq.insert(len(pq.contents), imgTag)
                pq.insert(len(pq.contents), iframe['src'])
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and spotify_regex.search(iframe['src']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ Spotify ]')
                pq.insert(1, br)
                imgUrl = self.get_spotify_pic(iframe['src'])
                if imgUrl is not None:
                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
                    pq.insert(len(pq.contents), imgTag)
                pq.insert(len(pq.contents), iframe['src'])
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and vine_regex.search(iframe['src']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ Vine ]')
                pq.insert(1, br)
                imgUrl = self.get_vine_pic(iframe['src'])
                if imgUrl is not None:
                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
                    pq.insert(len(pq.contents), imgTag)
                pq.insert(len(pq.contents), iframe['src'])
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and visualise_regex.search(iframe['src']) is not None:  # noqa
                imgUrl = self.get_visualise_pic(iframe['src'])
                if imgUrl is not None:
                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
                    iframe.replaceWith(imgTag)

        for blockquote in soup.findAll('blockquote'):
            if blockquote.has_key('class') and twitter_regex.search(blockquote['class']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ Twitter ]')
                pq.insert(len(pq.contents), br)
                match = re.search(
                    "(?P<url>pic\.twitter[^\s<]+)", str(blockquote))
                if match is not None:
                    img = self.get_twitter_pic(str(match.group("url")))
                    if img is not None:
                        pq.insert(len(pq.contents), img)
                for p in blockquote.findAll(name='p'):
                    x = 0
                    plen = len(p.contents)
                    while True:
                        c = len(pq.contents)
                        if p.contents[x].string is not None:
                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2),
                                                str(p.contents[x].string)))
                        else:
                            pq.insert(c, p.contents[x].content)
                        x += 1
                        if x == plen:
                            break
                    br = Tag(soup, 'br')
                    pq.insert(len(pq.contents), br)
                    p.extract()
                if len(blockquote.contents) > 0:
                    x = 0
                    xlen = len(blockquote.contents)
                    while True:
                        c = len(pq.contents)
                        if blockquote.contents[x].string is not None:
                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2),
                                                str(blockquote.contents[x].string)))
                        else:
                            pq.insert(c, blockquote.contents[x].content)
                        x += 1
                        if x == xlen:
                            break
                blockquote.replaceWith(pq)
            elif blockquote.has_key('class') and instagram_regex.search(blockquote['class']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ Instagram ]')
                pq.insert(1, br)
                a = blockquote.find(name='a', attrs={'href': instagram_regex})
                imgUrl = None
                if a is not None:
                    imgUrl = self.get_instagram_pic(str(a['href']))
                if imgUrl is not None:
                    img = Tag(soup, 'img', [('src', imgUrl)])
                    pq.insert(len(pq.contents), img)
                for p in blockquote.findAll(name='p'):
                    x = 0
                    plen = len(p.contents)
                    while x < plen:
                        c = len(pq.contents)
                        if p.contents[x].string is not None:
                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2),
                                                str(p.contents[x].string)))
                        # else:
                        #     pq.insert(c, p.contents[x].content)
                        x += 1
                    br = Tag(soup, 'br')
                    c = len(pq.contents)
                    pq.insert(c, br)
                blockquote.replaceWith(pq)

        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
            elif alink.img is not None:
                tstr = alink.img
                alink.replaceWith(tstr)
            elif alink.span is not None:
                tstr = alink.span
                alink.replaceWith(tstr)
        return soup

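    # Return the og:image URL from a visualise embed page, or None on failure.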
    def get_visualise_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        bs = BeautifulSoup(raw)
        imgRaw = bs.find(name='meta', attrs={'property': 'og:image'})
        if imgRaw is not None:
            returnValue = str(imgRaw['content'])
        return returnValue

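    # Resolve a pic.twitter.com link, following any meta-refresh redirect, and
    # return the permalink <img> tag, or None on failure.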
    def get_twitter_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open('https://' + url).read()
        except:
            print '404: ' + url
            return returnValue
        bs = BeautifulSoup(raw)
        refresh = bs.find('meta', {'http-equiv': 'refresh'})
        if refresh is not None:
            content = refresh.get('content').partition('=')[2]
            try:
                raw = self.browser.open(content).read()
            except:
                print '404: ' + url
                return returnValue
            bs = BeautifulSoup(raw)
        img = bs.find(name='img', attrs={
            'alt': re.compile('.*permalink.*', re.IGNORECASE)})
        if img is not None:
            returnValue = img
        return returnValue

    def get_soundcloud_pic(self, url):
        # content is loaded via javascript and requires a login and/or a registered application id
        # returnValue = None
        # raw = self.browser.open(soundcloudUrl + '&visual=true').read()
        # bs = BeautifulSoup(raw)
        # imgRaw = bs.find(name='div', attrs={'style': re.compile(r'background-image:*?', re.IGNORECASE)})
        # if imgRaw is not None:
        #     returnValue = str(imgRaw['style'])
        return None  # returnValue

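    # Extract the display_src image URL from an Instagram post page, or None on failure.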
    def get_instagram_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        m = re.search('\"display_src\":\"(?P<url>http[s]?:.*?)\"', str(raw))
        if m is not None:
            returnValue = re.sub(
                r'\\', '', m.group("url"), flags=re.DOTALL | re.IGNORECASE)
        return returnValue

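    # Extract the thumbnail_url from a DailyMotion embed page, or None on failure.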
    def get_dailymotion_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        m = re.search('("thumbnail_url\"\:\")(?P<url>http.*?)(\")', str(raw))
        if m is not None:
            returnValue = re.sub(
                r'\\', '', m.group("url"), flags=re.DOTALL | re.IGNORECASE)
        return returnValue

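    # Extract the cover-art URL (data-ca attribute) from a Spotify embed page, or None on failure.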
    def get_spotify_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        m = re.search('data-ca=\"(?P<url>.*?)\"', str(raw))
        if m is not None:
            returnValue = m.group("url")
        return returnValue

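    # Extract the thumbnail image URL from a Vine embed page, or None on failure.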
    def get_vine_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        m = re.search('"thumbnail.*?src=\"(?P<url>.*?)\"', str(raw))
        if m is not None:
            returnValue = m.group("url")
        return returnValue

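    # Strip scripts, 'buy tickets' buttons and gallery links from the raw HTML
    # before it is parsed.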
    preprocess_regexps = [
        (re.compile(r'<script\b.+?</script>', re.DOTALL | re.IGNORECASE), lambda h1: ''),
        (re.compile(r'<a.* id="buy-tickets-button".*</a>', re.IGNORECASE), lambda h2: ''),
        (re.compile(r'<a.* class="gallery.*</a>', re.IGNORECASE), lambda h2: ''),
    ]

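    # Styling applied to the converted articles.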
    extra_css = '''
        h1, h2 {
            font-family:Arial,Helvetica,sans-serif;
            font-weight:bold;
            font-size:large;
        }
        h3 {
            font-family:Arial,Helvetica,sans-serif;
            font-weight:normal;
            font-size:small;
            font-style:italic;
            display:inline;
        }
        body {
            font-family:Helvetica,Arial,sans-serif;
            font-size:small;
        }
        blockquote {
            font-family:"Courier New", Courier, monospace;
            font-size:90%;
        }
        img {
            display:block;
        }
        .date {
            font-style:italic;
            font-weight:normal;
        }
        .article_header>p:not(.date) {
            font-weight:bold;
        }
    '''