import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import Tag
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
# Recipe metadata. description, category, publisher and language are reused
# in conversion_options below.
title = u'New Musical Express Magazine'
description = 'UK Rock & Pop Mag.'
__author__ = 'Dave Asbury, Inge Aning'
category = 'Music, Film, Tv'
publisher = 'Time Inc. (UK) Ltd.'

# Changelog -- updated 11/3/2015:
#   * feeds URL
#   * cover and masthead URL
#   * fix for a bug that prevented some pages from rendering
#   * changes to website

# Download and clean-up settings (see the calibre BasicNewsRecipe API for the
# exact semantics of each attribute).
remove_empty_feeds = True
encoding = 'utf-8'
remove_javascript = True
no_stylesheets = True
oldest_article = 7  # in days, per the calibre API documentation
max_articles_per_feed = 20
# Automatic clean-up is off; pages are trimmed explicitly with
# keep_only_tags / remove_tags / remove_attributes below.
auto_cleanup = False
language = 'en'
compress_news_images = True
simultaneous_downloads = 20
use_embedded_content = False
recursions = 0

# Hand the recipe metadata defined above to the conversion step.
conversion_options = {
    'comment': description,
    'tags': category,
    'publisher': publisher,
    'language': language,
}

# (feed title, feed URL) pairs.
feeds = [
    (u'NME News', u'http://www.nme.com/rss/news'),
    (u'Reviews', u'http://www.nme.com/rss/reviews'),
    (u'Blogs', u'http://www.nme.com/rss/blogs'),
]

# Only the main content container of each page is kept.
keep_only_tags = [
    dict(name='div', attrs={'id': 'content'}),
]

# Presentational attributes stripped from every remaining tag.
remove_attributes = [
    'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
    'valign', 'vspace', 'hspace', 'alt', 'width', 'height',
]

# Elements dropped from inside the kept content. Plain strings are matched
# as given; compiled patterns are case-insensitive.
remove_tags = [
    dict(name='meta'),
    dict(name='span', attrs={'class': 'article_info'}),
    dict(name='div', attrs={'class': 'breadcrumbs'}),
    dict(name='div', attrs={'class': 'mugshot'}),
    dict(name='div', attrs={'class': 'header'}),
    dict(name='div', attrs={'class': re.compile('youtube.*', re.IGNORECASE)}),
    dict(name='div', attrs={'class': re.compile('socialbuttons.*', re.IGNORECASE)}),
    dict(name='div', attrs={'class': 'clear_both'}),
    dict(name='div', attrs={'class': re.compile('headline.*', re.IGNORECASE)}),
    dict(name='div', attrs={'class': 'member-signedout'}),
    dict(name='div', attrs={'class': re.compile('prev_next.*', re.IGNORECASE)}),
    dict(name='div', attrs={'class': re.compile('article_related.*', re.IGNORECASE)}),
    dict(name='div', attrs={'class': re.compile('feature_bar.*', re.IGNORECASE)}),
    dict(name='div', attrs={'class': re.compile('ebay.*', re.IGNORECASE)}),
    dict(name='div', attrs={'id': re.compile('morenews.*', re.IGNORECASE)}),
    dict(name='div', attrs={'id': re.compile('ticketspopup.*', re.IGNORECASE)}),
    dict(name='div', attrs={'id': re.compile('ratemy_logprompt.*', re.IGNORECASE)}),
    dict(name='div', attrs={'id': re.compile('related_artist.*', re.IGNORECASE)}),
    dict(name='img', attrs={'class': re.compile('video_play_large.*', re.IGNORECASE)}),
    dict(name='ul', attrs={'class': re.compile('prev_next.*', re.IGNORECASE)}),
    dict(name='ul', attrs={'class': re.compile('nme_store.*', re.IGNORECASE)}),
    dict(name='p', attrs={'class': re.compile('top', re.IGNORECASE)}),
    dict(name='table', attrs={'class': re.compile('tickets.*', re.IGNORECASE)}),
]

# Static masthead image.
masthead_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
def get_cover_url(self):
magazine_page_raw = self.index_to_soup(
'http://www.nme.com/magazine', raw=True)
magazine_page_raw = re.sub(
r'', re.DOTALL | re.IGNORECASE), lambda h1: ''),
(re.compile(r'', re.IGNORECASE), lambda h2: ''),
(re.compile(r'p:not(.date){
font-weight:bold;
}
'''