diff --git a/recipes/nme.recipe b/recipes/nme.recipe
index b5f685b911..d65f7fe619 100644
--- a/recipes/nme.recipe
+++ b/recipes/nme.recipe
@@ -1,25 +1,91 @@
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre import browser
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.BeautifulSoup import Tag
 
 class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     title = u'New Musical Express Magazine'
-    description = 'Author D.Asbury. UK Rock & Pop Mag. '
-    __author__ = 'Dave Asbury'
-    # last updated 17/5/13 News feed url altered
+    description = 'UK Rock & Pop Mag.'
+    __author__ = 'Dave Asbury, Inge Aning'
+    category = 'Music, Film, TV'
+    publisher = 'Time Inc. (UK) Ltd.'
+    '''
+    ' updated 11/3/2015
+    ' feeds url
+    ' cover and masthead url
+    ' fix for a bug that prevents some pages rendering
+    ' changes to website
+    '''
+
+    remove_empty_feeds = True
-    remove_javascript = True
+    encoding = 'utf-8'
+    remove_javascript = True
     no_stylesheets = True
     oldest_article = 7
     max_articles_per_feed = 20
-    #auto_cleanup = True
-    language = 'en_GB'
+    auto_cleanup = False
+    language = 'en'
     compress_news_images = True
+    simultaneous_downloads = 20
+    use_embedded_content = False
+    recursions = 0
+
+    conversion_options = {
+        'comment' : description
+        , 'tags' : category
+        , 'publisher' : publisher
+        , 'language' : language
+    }
+
+    feeds = [
+        (u'NME News',u'http://www.nme.com/rss/news'),
+        (u'Reviews',u'http://www.nme.com/rss/reviews'),
+        (u'Blogs',u'http://www.nme.com/rss/blogs'),
+    ]
+
+    keep_only_tags = [
+        dict(name='div',attrs={'id':'content'}),
+    ]
+
+    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
+                         'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
+
+    remove_tags = [
+        dict(name='meta'),
+        dict(name='span',attrs={'class':'article_info'}),
+        dict(name='div',attrs={'class':'breadcrumbs'}),
+        dict(name='div',attrs={'class':'mugshot'}),
+        dict(name='div',attrs={'class':'header'}),
+        dict(name='div',attrs={'class':re.compile('youtube.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'class':re.compile('socialbuttons.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'class':'clear_both'}),
+        dict(name='div',attrs={'class':re.compile('headline.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'class':'member-signedout'}),
+        dict(name='div',attrs={'class':re.compile('prev_next.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'class':re.compile('article_related.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'class':re.compile('feature_bar.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'class':re.compile('ebay.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'id':re.compile('morenews.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'id':re.compile('ticketspopup.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'id':re.compile('ratemy_logprompt.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'id':re.compile('related_artist.*',re.IGNORECASE)}),
+        dict(name='img',attrs={'class':re.compile('video_play_large.*',re.IGNORECASE)}),
+        dict(name='ul',attrs={'class':re.compile('prev_next.*',re.IGNORECASE)}),
+        dict(name='ul',attrs={'class':re.compile('nme_store.*',re.IGNORECASE)}),
+        dict(name='p',attrs={'class':re.compile('top',re.IGNORECASE)}),
+        dict(name='table',attrs={'class':re.compile('tickets.*',re.IGNORECASE)}),
+    ]
+
+    masthead_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
+
     def get_cover_url(self):
-        soup = self.index_to_soup('http://www.nme.com/component/subscribe')
-        cov = soup.find(attrs={'id' : 'magazine_cover'})
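+        # The raw magazine page is scrubbed before parsing: markup that
+        # chokes BeautifulSoup is stripped, and IE '<![if ...]>' conditional
+        # comments are rewritten as '<!--[if ...]>' so a tree can be built.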
+        magazine_page_raw = self.index_to_soup('http://www.nme.com/magazine', raw=True)
+        magazine_page_raw = re.sub(r'<script\b.*?</script>', '', magazine_page_raw, flags=re.DOTALL|re.IGNORECASE)
+        magazine_page_raw = re.sub(r'\!\[if ', '!--[if ', magazine_page_raw, flags=re.DOTALL|re.IGNORECASE)
+        magazine_page = self.index_to_soup(magazine_page_raw)
+        cov = magazine_page.find('img',attrs={'class':'magcover'})
         cov2 = str(cov['src'])
-        # print '**** Cov url =*', cover_url,'***'
-        #print '**** Cov url =*','http://www.magazinesdirect.com/article_images/articledir_3138/1569221/1_largelisting.jpg','***'
         br = browser()
         br.set_handle_redirect(False)
@@ -27,43 +93,291 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
             br.open_novisit(cov2)
             cover_url = str(cov2)
         except:
-            cover_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
+            cover_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
         return cover_url
 
-    masthead_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
-    remove_tags = [
-        dict(attrs={'class':'clear_icons'}),
-        dict(attrs={'class':'share_links'}),
-        dict(attrs={'id':'right_panel'}),
-        dict(attrs={'class':'today box'}),
+    def preprocess_raw_html(self, raw_html, url):
+        '''
+        Needed for a site bug that prevents blog posts being parsed correctly.
+        '''
+        raw_html = re.sub(r'\!\[if ', '!--[if ', raw_html, flags=re.DOTALL|re.IGNORECASE)
+        return raw_html
-    ]
+
+    def preprocess_html(self, soup):
+        youtube_regex = re.compile(r'(?:youtube\.com/(?:[^/]+/.+/|(?:v|e(?:mbed)?)/|.*[?&]v=)|youtu\.be/)(?P<id>[^"&?/ ]{11})', re.DOTALL|re.IGNORECASE)
+        instagram_regex = re.compile(r'.*?instagram.*?', re.DOTALL|re.IGNORECASE)
+        twitter_regex = re.compile(r'.*?twitter.*?', re.DOTALL|re.IGNORECASE)
+        visualise_regex = re.compile(r'.*?visualise.*?', re.DOTALL|re.IGNORECASE)
+        soundcloud_regex = re.compile(r'(?P<url>.*?(w|api)\.soundcloud.*?com\/(tracks|playlists)\/\d{8,9})', re.DOTALL|re.IGNORECASE)
+        dailymotion_regex = re.compile(r'.*?dailymotion.*?', re.DOTALL|re.IGNORECASE)
+        spotify_regex = re.compile(r'.*?spotify.*?', re.DOTALL|re.IGNORECASE)
+        vine_regex = re.compile(r'.*?vine.*?', re.DOTALL|re.IGNORECASE)
+        doubleHtmlEntities = re.compile(ur'(&amp;)(?P<entity>[\d\w\#]*;)', re.DOTALL|re.IGNORECASE|re.UNICODE)
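+        # Embedded players can't run in an e-book, so each recognised
+        # service iframe (YouTube, SoundCloud, DailyMotion, Spotify, Vine)
+        # is replaced with a '[ Service ]' blockquote carrying a scraped
+        # thumbnail (where one is available) and the embed's source URL;
+        # visualise embeds are reduced to just their thumbnail image.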
+        for iframe in soup.findAll('iframe'):
+            if iframe.has_key('src') and youtube_regex.search(iframe['src']) is not None:
+                pq = Tag(soup, 'blockquote')
+                br = Tag(soup, 'br')
+                pq.insert(0, '[ YouTube ]')
+                pq.insert(1, br)
+                m = youtube_regex.search(iframe['src'])
+                if m.group('id') is not None:
+                    imgTag = Tag(soup, 'img', [('src','http://img.youtube.com/vi/' + m.group('id') + '/0.jpg')])
+                    pq.insert(len(pq.contents), imgTag)
+                pq.insert(len(pq.contents), iframe['src'])
+                iframe.replaceWith(pq)
+            elif iframe.has_key('src') and soundcloud_regex.search(iframe['src']) is not None:
+                m = soundcloud_regex.search(iframe['src'])
+                pq = Tag(soup, 'blockquote')
+                br = Tag(soup, 'br')
+                pq.insert(0, '[ SoundCloud ]')
+                pq.insert(1, br)
+                pq.insert(2, m.group('url'))
+                # imgUrl = self.get_soundcloud_pic(iframe['src'])
+                iframe.replaceWith(pq)
+            elif iframe.has_key('src') and dailymotion_regex.search(iframe['src']) is not None:
+                pq = Tag(soup, 'blockquote')
+                br = Tag(soup, 'br')
+                pq.insert(0, '[ DailyMotion ]')
+                pq.insert(1, br)
+                imgUrl = self.get_dailymotion_pic(iframe['src'])
+                if imgUrl is not None:
+                    imgTag = Tag(soup, 'img', [('src',imgUrl)])
+                    pq.insert(len(pq.contents), imgTag)
+                pq.insert(len(pq.contents), iframe['src'])
+                iframe.replaceWith(pq)
+            elif iframe.has_key('src') and spotify_regex.search(iframe['src']) is not None:
+                pq = Tag(soup, 'blockquote')
+                br = Tag(soup, 'br')
+                pq.insert(0, '[ Spotify ]')
+                pq.insert(1, br)
+                imgUrl = self.get_spotify_pic(iframe['src'])
+                if imgUrl is not None:
+                    imgTag = Tag(soup, 'img', [('src',imgUrl)])
+                    pq.insert(len(pq.contents), imgTag)
+                pq.insert(len(pq.contents), iframe['src'])
+                iframe.replaceWith(pq)
+            elif iframe.has_key('src') and vine_regex.search(iframe['src']) is not None:
+                pq = Tag(soup, 'blockquote')
+                br = Tag(soup, 'br')
+                pq.insert(0, '[ Vine ]')
+                pq.insert(1, br)
+                imgUrl = self.get_vine_pic(iframe['src'])
+                if imgUrl is not None:
+                    imgTag = Tag(soup, 'img', [('src',imgUrl)])
+                    pq.insert(len(pq.contents), imgTag)
+                pq.insert(len(pq.contents), iframe['src'])
+                iframe.replaceWith(pq)
+            elif iframe.has_key('src') and visualise_regex.search(iframe['src']) is not None:
+                imgUrl = self.get_visualise_pic(iframe['src'])
+                if imgUrl is not None:
+                    imgTag = Tag(soup, 'img', [('src',imgUrl)])
+                    iframe.replaceWith(imgTag)
+        for blockquote in soup.findAll('blockquote'):
+            if blockquote.has_key('class') and twitter_regex.search(blockquote['class']) is not None:
+                pq = Tag(soup, 'blockquote')
+                br = Tag(soup, 'br')
+                pq.insert(0, '[ Twitter ]')
+                pq.insert(len(pq.contents), br)
+                match = re.search("(?P<url>pic\.twitter[^\s<]+)", str(blockquote))
+                if match is not None:
+                    img = self.get_twitter_pic(str(match.group("url")))
+                    if img is not None:
+                        pq.insert(len(pq.contents), img)
+                for p in blockquote.findAll(name='p'):
+                    x = 0
+                    plen = len(p.contents)
+                    while x < plen:
+                        c = len(pq.contents)
+                        if p.contents[x].string is not None:
+                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(p.contents[x].string)))
+                        else:
+                            pq.insert(c, p.contents[x].content)
+                        x += 1
+                    br = Tag(soup, 'br')
+                    pq.insert(len(pq.contents), br)
+                    p.extract()
+                if len(blockquote.contents) > 0:
+                    x = 0
+                    xlen = len(blockquote.contents)
+                    while x < xlen:
+                        c = len(pq.contents)
+                        if blockquote.contents[x].string is not None:
+                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(blockquote.contents[x].string)))
+                        else:
+                            pq.insert(c, blockquote.contents[x].content)
+                        x += 1
+                blockquote.replaceWith(pq)
+            elif blockquote.has_key('class') and instagram_regex.search(blockquote['class']) is not None:
+                pq = Tag(soup, 'blockquote')
+                br = Tag(soup, 'br')
+                pq.insert(0, '[ Instagram ]')
+                pq.insert(1, br)
+                a = blockquote.find(name='a',attrs={'href':instagram_regex})
+                imgUrl = None
+                if a is not None:
+                    imgUrl = self.get_instagram_pic(str(a['href']))
+                if imgUrl is not None:
+                    img = Tag(soup, 'img', [('src',imgUrl)])
+                    pq.insert(len(pq.contents), img)
+                for p in blockquote.findAll(name='p'):
+                    x = 0
+                    plen = len(p.contents)
+                    while x < plen:
+                        c = len(pq.contents)
+                        if p.contents[x].string is not None:
+                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(p.contents[x].string)))
+                        # else:
+                        #     pq.insert(c, p.contents[x].content)
+                        x += 1
+                    br = Tag(soup, 'br')
+                    c = len(pq.contents)
+                    pq.insert(c, br)
+                blockquote.replaceWith(pq)
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
+            elif alink.img is not None:
+                tstr = alink.img
+                alink.replaceWith(tstr)
+            elif alink.span is not None:
+                tstr = alink.span
+                alink.replaceWith(tstr)
+        return soup
 
-    keep_only_tags = [
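+    # Each get_*_pic() helper fetches an embed's public page and scrapes a
+    # thumbnail from its markup; get_twitter_pic() returns an <img> tag,
+    # the others a URL string, and all of them fall back to None when the
+    # page cannot be fetched or no image is found.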
+    def get_visualise_pic(self, url):
+        returnValue = None
+        try:
+            raw = self.browser.open(url).read()
+        except:
+            print '404: ' + url
+            return returnValue
+        bs = BeautifulSoup(raw)
+        imgRaw = bs.find(name='meta', attrs={'property':'og:image'})
+        if imgRaw is not None:
+            returnValue = str(imgRaw['content'])
+        return returnValue
-
-        dict(name='h1'),
-        #dict(name='h3'),
-        dict(attrs={'class' : 'BText'}),
-        dict(attrs={'class' : 'Bmore'}),
-        dict(attrs={'class' : 'bPosts'}),
-        dict(attrs={'class' : 'text'}),
-        dict(attrs={'id' : 'article_gallery'}),
-        #dict(attrs={'class' : 'image'}),
-        dict(attrs={'class' : 'article_text'})
+
+    def get_twitter_pic(self, url):
+        returnValue = None
+        try:
+            raw = self.browser.open('https://' + url).read()
+        except:
+            print '404: ' + url
+            return returnValue
+        bs = BeautifulSoup(raw)
+        refresh = bs.find('meta', {'http-equiv':'refresh'})
+        if refresh is not None:
+            content = refresh.get('content').partition('=')[2]
+            try:
+                raw = self.browser.open(content).read()
+            except:
+                print '404: ' + url
+                return returnValue
+            bs = BeautifulSoup(raw)
+        img = bs.find(name='img',attrs={'alt':re.compile('.*permalink.*',re.IGNORECASE)})
+        if img is not None:
+            returnValue = img
+        return returnValue
+
+    def get_soundcloud_pic(self, url):
+        # content is loaded via javascript and requires a login and/or a
+        # registered application identification
+        # returnValue = None
+        # raw = self.browser.open(soundcloudUrl + '&visual=true').read()
+        # bs = BeautifulSoup(raw)
+        # imgRaw = bs.find(name='div', attrs={'style':re.compile(r'background-image:*?',re.IGNORECASE)})
+        # if imgRaw is not None:
+        #     returnValue = str(imgRaw['style'])
+        return None  # returnValue
+
+    def get_instagram_pic(self, url):
+        returnValue = None
+        try:
+            raw = self.browser.open(url).read()
+        except:
+            print '404: ' + url
+            return returnValue
+        m = re.search('\"display_src\":\"(?P<url>http[s]?:.*?)\"', str(raw))
+        if m is not None:
+            returnValue = re.sub(r'\\', '', m.group("url"), flags=re.DOTALL|re.IGNORECASE)
+        return returnValue
+
+    def get_dailymotion_pic(self, url):
+        returnValue = None
+        try:
+            raw = self.browser.open(url).read()
+        except:
+            print '404: ' + url
+            return returnValue
+        m = re.search('("thumbnail_url\"\:\")(?P<url>http.*?)(\")', str(raw))
+        if m is not None:
+            returnValue = re.sub(r'\\', '', m.group("url"), flags=re.DOTALL|re.IGNORECASE)
+        return returnValue
+
+    def get_spotify_pic(self, url):
+        returnValue = None
+        try:
+            raw = self.browser.open(url).read()
+        except:
+            print '404: ' + url
+            return returnValue
+        m = re.search('data-ca=\"(?P<url>.*?)\"', str(raw))
+        if m is not None:
+            returnValue = m.group("url")
+        return returnValue
+
+    def get_vine_pic(self, url):
+        returnValue = None
+        try:
+            raw = self.browser.open(url).read()
+        except:
+            print '404: ' + url
+            return returnValue
+        m = re.search('"thumbnail.*?src=\"(?P<url>.*?)\"', str(raw))
+        if m is not None:
+            returnValue = m.group("url")
+        return returnValue
+
+    preprocess_regexps = [
+        (re.compile(r'', re.DOTALL|re.IGNORECASE), lambda h1: ''),
+        (re.compile(r'', re.IGNORECASE), lambda h2: ''),
+    ]
+
+    extra_css = '''
+        p:not(.date){
+            font-weight:bold;
+        }
+    '''