import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import Tag
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
# Descriptive metadata for this recipe. description, category and publisher
# are reused further down as values in conversion_options.
__author__ = "Dave Asbury, Inge Aning"
title = u"New Musical Express Magazine"
publisher = "Time Inc. (UK) Ltd."
category = "Music, Film, Tv"
description = "UK Rock & Pop Mag."
'''
' updated 11/3/2015
' feeds url
' cover and masthead url
' fix for a bug that prevented some pages from rendering
' changes to website
'''
# Language and character encoding.
language = "en"
encoding = "utf-8"

# Feed and article selection limits.
oldest_article = 7
max_articles_per_feed = 20
remove_empty_feeds = True
use_embedded_content = False

# Download settings.
simultaneous_downloads = 20
recursions = 0

# Page clean-up and image switches.
auto_cleanup = False
remove_javascript = True
no_stylesheets = True
compress_news_images = True
# Maps conversion option names onto the recipe's own metadata attributes
# (description, category, publisher, language) assigned earlier in the class.
# NOTE(review): calibre's option for the book description is believed to be
# spelled 'comments'; confirm that the 'comment' key is actually honoured.
conversion_options = {
    'comment': description,
    'tags': category,
    'publisher': publisher,
    'language': language,
}
# (feed title, RSS URL) pairs, in the order they are listed.
feeds = [
    (u"NME News", u"http://www.nme.com/rss/news"),
    (u"Reviews", u"http://www.nme.com/rss/reviews"),
    (u"Blogs", u"http://www.nme.com/rss/blogs"),
]
# Single selector: the <div> whose id is "content".
keep_only_tags = [{'name': 'div', 'attrs': {'id': 'content'}}]

# Attribute names listed for removal (mostly presentational HTML attributes).
remove_attributes = [
    'border',
    'cellspacing',
    'align',
    'cellpadding',
    'colspan',
    'valign',
    'vspace',
    'hspace',
    'alt',
    'width',
    'height',
]
# Selectors for page furniture: each entry names a tag and, optionally, the
# class or id it must carry. Patterns built with re.compile are
# case-insensitive regular expressions; plain strings are literal values.
remove_tags = [
    {'name': 'meta'},
    {'name': 'span', 'attrs': {'class': 'article_info'}},
    # <div> selected by class
    {'name': 'div', 'attrs': {'class': 'breadcrumbs'}},
    {'name': 'div', 'attrs': {'class': 'mugshot'}},
    {'name': 'div', 'attrs': {'class': 'header'}},
    {'name': 'div', 'attrs': {'class': re.compile('youtube.*', re.I)}},
    {'name': 'div', 'attrs': {'class': re.compile('socialbuttons.*', re.I)}},
    {'name': 'div', 'attrs': {'class': 'clear_both'}},
    {'name': 'div', 'attrs': {'class': re.compile('headline.*', re.I)}},
    {'name': 'div', 'attrs': {'class': 'member-signedout'}},
    {'name': 'div', 'attrs': {'class': re.compile('prev_next.*', re.I)}},
    {'name': 'div', 'attrs': {'class': re.compile('article_related.*', re.I)}},
    {'name': 'div', 'attrs': {'class': re.compile('feature_bar.*', re.I)}},
    {'name': 'div', 'attrs': {'class': re.compile('ebay.*', re.I)}},
    # <div> selected by id
    {'name': 'div', 'attrs': {'id': re.compile('morenews.*', re.I)}},
    {'name': 'div', 'attrs': {'id': re.compile('ticketspopup.*', re.I)}},
    {'name': 'div', 'attrs': {'id': re.compile('ratemy_logprompt.*', re.I)}},
    {'name': 'div', 'attrs': {'id': re.compile('related_artist.*', re.I)}},
    # other elements
    {'name': 'img', 'attrs': {'class': re.compile('video_play_large.*', re.I)}},
    {'name': 'ul', 'attrs': {'class': re.compile('prev_next.*', re.I)}},
    {'name': 'ul', 'attrs': {'class': re.compile('nme_store.*', re.I)}},
    {'name': 'p', 'attrs': {'class': re.compile('top', re.I)}},
    {'name': 'table', 'attrs': {'class': re.compile('tickets.*', re.I)}},
]
# Address of the NME logo image, split across lines for readability only.
masthead_url = (
    'http://default.media.ipcdigital.co.uk/300/000001014/'
    'e1ab_orh100000w300/NME-logo.jpg'
)
def get_cover_url(self):
magazine_page_raw = self.index_to_soup('http://www.nme.com/magazine', raw=True)
magazine_page_raw = re.sub(r'', re.DOTALL|re.IGNORECASE), lambda h1: ''),
(re.compile(r'',re.IGNORECASE), lambda h2: ''),
(re.compile(r'p:not(.date){
font-weight:bold;
}
'''