import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import Tag

# Flag combination shared by almost every pattern in this recipe.
_CASELESS = re.DOTALL | re.IGNORECASE

# Used both as the masthead and as the cover fallback.
_NME_LOGO_URL = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'

# Rewrites '![if ' to '!--[if ' in raw markup (see preprocess_raw_html).
_COND_COMMENT_RE = re.compile(r'\!\[if ', _CASELESS)

# The named groups below were missing from the source as received
# ('(?P[...' does not compile); the names are taken from the
# m.group('id') / m.group('url') calls that read them.
_YOUTUBE_RE = re.compile(
    r'(?:youtube\.com/(?:[^/]+/.+/|(?:v|e(?:mbed)?)/|.*[?&]v=)|youtu\.be/)'
    r'(?P<id>[^"&?/ ]{11})', _CASELESS)
_SOUNDCLOUD_RE = re.compile(
    r'(?P<url>.*?(w|api)\.soundcloud.*?com\/(tracks|playlists)\/\d{8,9})',
    _CASELESS)
_INSTAGRAM_RE = re.compile(r'.*?instagram.*?', _CASELESS)
_TWITTER_RE = re.compile(r'.*?twitter.*?', _CASELESS)
_VISUALISE_RE = re.compile(r'.*?visualise.*?', _CASELESS)
_DAILYMOTION_RE = re.compile(r'.*?dailymotion.*?', _CASELESS)
_SPOTIFY_RE = re.compile(r'.*?spotify.*?', _CASELESS)
_VINE_RE = re.compile(r'.*?vine.*?', _CASELESS)

_TWITTER_PIC_RE = re.compile(r'(?P<url>pic\.twitter[^\s<]+)')
_INSTAGRAM_PIC_RE = re.compile(r'"display_src":"(?P<url>http[s]?:.*?)"')
_DAILYMOTION_PIC_RE = re.compile(r'("thumbnail_url":")(?P<url>http.*?)(")')
_SPOTIFY_PIC_RE = re.compile(r'data-ca="(?P<url>.*?)"')
_VINE_PIC_RE = re.compile(r'"thumbnail.*?src="(?P<url>.*?)"')

# NOTE(review): the source as received had '(&)' as the first group, which
# makes the substitution an identity. '&amp;' is restored here on the
# assumption (from the original variable name, doubleHtmlEntities) that the
# pattern targets doubly-escaped entities such as '&amp;quot;' -- confirm
# against the upstream recipe.
_DOUBLE_ENTITY_RE = re.compile(
    r'(&amp;)(?P<entity>[\d\w\#]*;)',
    re.DOTALL | re.IGNORECASE | re.UNICODE)


def _ci(pattern):
    """Compile *pattern* case-insensitively (used by the tag filters)."""
    return re.compile(pattern, re.IGNORECASE)


def _collapse_double_entity(match):
    """Return the matched entity with its leading group replaced by '&'."""
    return '&' + match.group(2)


def _search_url(pattern, raw):
    """Return the ``url`` group of *pattern* in ``str(raw)``, or None."""
    found = pattern.search(str(raw))
    if found is None:
        return None
    return found.group('url')


class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    """Calibre recipe for the New Musical Express feeds on nme.com.

    Besides the declarative settings, the recipe rewrites embedded media
    (iframes and Twitter/Instagram blockquotes) into static placeholders
    and unwraps hyperlinks; see ``preprocess_html``.
    """

    title = u'New Musical Express Magazine'
    description = 'UK Rock & Pop Mag.'
    __author__ = 'Dave Asbury, Inge Aning'
    category = 'Music, Film, Tv'
    publisher = 'Time Inc. (UK) Ltd.'

    # Change log:
    #   updated 11/3/2015
    #   feeds url
    #   cover and masthead url
    #   fix for a bug that prevents some pages from rendering
    #   changes to website

    remove_empty_feeds = True
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 20
    auto_cleanup = False
    language = 'en'
    compress_news_images = True
    simultaneous_downloads = 20
    use_embedded_content = False
    recursions = 0

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    feeds = [
        (u'NME News', u'http://www.nme.com/rss/news'),
        (u'Reviews', u'http://www.nme.com/rss/reviews'),
        (u'Blogs', u'http://www.nme.com/rss/blogs'),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'id': 'content'}),
    ]

    remove_attributes = [
        'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
        'valign', 'vspace', 'hspace', 'alt', 'width', 'height',
    ]

    remove_tags = [
        dict(name='meta'),
        dict(name='span', attrs={'class': 'article_info'}),
        dict(name='div', attrs={'class': 'breadcrumbs'}),
        dict(name='div', attrs={'class': 'mugshot'}),
        dict(name='div', attrs={'class': 'header'}),
        dict(name='div', attrs={'class': _ci('youtube.*')}),
        dict(name='div', attrs={'class': _ci('socialbuttons.*')}),
        dict(name='div', attrs={'class': 'clear_both'}),
        dict(name='div', attrs={'class': _ci('headline.*')}),
        dict(name='div', attrs={'class': 'member-signedout'}),
        dict(name='div', attrs={'class': _ci('prev_next.*')}),
        dict(name='div', attrs={'class': _ci('article_related.*')}),
        dict(name='div', attrs={'class': _ci('feature_bar.*')}),
        dict(name='div', attrs={'class': _ci('ebay.*')}),
        dict(name='div', attrs={'id': _ci('morenews.*')}),
        dict(name='div', attrs={'id': _ci('ticketspopup.*')}),
        dict(name='div', attrs={'id': _ci('ratemy_logprompt.*')}),
        dict(name='div', attrs={'id': _ci('related_artist.*')}),
        dict(name='img', attrs={'class': _ci('video_play_large.*')}),
        dict(name='ul', attrs={'class': _ci('prev_next.*')}),
        dict(name='ul', attrs={'class': _ci('nme_store.*')}),
        dict(name='p', attrs={'class': _ci('top')}),
        dict(name='table', attrs={'class': _ci('tickets.*')}),
    ]

    masthead_url = _NME_LOGO_URL

    def get_cover_url(self):
        """Return the current magazine cover URL, or the NME logo.

        The magazine page is fetched raw, patched with the same
        conditional-comment rewrite as ``preprocess_raw_html`` and parsed;
        the ``img.magcover`` source is used if it can be opened without
        following redirects. Any failure falls back to the logo.
        """
        page_raw = self.index_to_soup('http://www.nme.com/magazine', raw=True)
        # NOTE(review): this pattern is empty in the source as received,
        # which makes the call a no-op; the original expression appears to
        # have been lost. Kept as received -- restore from upstream.
        page_raw = re.sub(r'', '', page_raw, flags=_CASELESS)
        page_raw = _COND_COMMENT_RE.sub('!--[if ', page_raw)
        page = self.index_to_soup(page_raw)

        cover_img = page.find('img', attrs={'class': 'magcover'})
        if cover_img is None or not cover_img.get('src'):
            # Previously this dereferenced None and raised.
            return _NME_LOGO_URL
        candidate = str(cover_img['src'])

        probe = browser()
        probe.set_handle_redirect(False)
        try:
            probe.open_novisit(candidate)
        except Exception:
            return _NME_LOGO_URL
        return candidate

    def preprocess_raw_html(self, raw_html, url):
        """Rewrite '![if ' to '!--[if ' in the raw page.

        Works around a bug on the site that prevents blog posts from being
        parsed correctly.
        """
        return _COND_COMMENT_RE.sub('!--[if ', raw_html)

    def preprocess_html(self, soup):
        """Replace embedded media with placeholders and unwrap links.

        Iframes are handled first, then Twitter/Instagram blockquotes, and
        finally every ``<a>`` is replaced by its text, image or span.
        Returns the same (mutated) soup.
        """
        for iframe in soup.findAll('iframe'):
            self._replace_iframe(soup, iframe)

        # Looked up after the iframe pass, so the placeholders created
        # above are visited too; they carry no class and are skipped.
        for blockquote in soup.findAll('blockquote'):
            css_class = blockquote.get('class')
            if not css_class:
                continue
            if _TWITTER_RE.search(css_class) is not None:
                self._rebuild_twitter(soup, blockquote)
            elif _INSTAGRAM_RE.search(css_class) is not None:
                self._rebuild_instagram(soup, blockquote)

        for alink in soup.findAll('a'):
            if alink.string is not None:
                alink.replaceWith(alink.string)
            elif alink.img is not None:
                alink.replaceWith(alink.img)
            elif alink.span is not None:
                alink.replaceWith(alink.span)
        return soup

    def _placeholder(self, soup, label, img_src=None, caption=None):
        """Build a blockquote: label, <br>, optional <img>, optional text."""
        box = Tag(soup, 'blockquote')
        box.insert(0, label)
        box.insert(1, Tag(soup, 'br'))
        if img_src is not None:
            box.insert(len(box.contents), Tag(soup, 'img', [('src', img_src)]))
        if caption is not None:
            box.insert(len(box.contents), caption)
        return box

    def _replace_iframe(self, soup, iframe):
        """Swap one iframe for a placeholder chosen by its ``src``."""
        src = iframe.get('src')
        if not src:
            return

        found = _YOUTUBE_RE.search(src)
        if found is not None:
            thumb = 'http://img.youtube.com/vi/' + found.group('id') + '/0.jpg'
            iframe.replaceWith(
                self._placeholder(soup, '[ YouTube ]', thumb, src))
            return

        found = _SOUNDCLOUD_RE.search(src)
        if found is not None:
            iframe.replaceWith(self._placeholder(
                soup, '[ SoundCloud ]', caption=found.group('url')))
            return

        # Same shape for all three: label, fetched thumbnail, source URL.
        fetchers = (
            ('[ DailyMotion ]', _DAILYMOTION_RE, self.get_dailymotion_pic),
            ('[ Spotify ]', _SPOTIFY_RE, self.get_spotify_pic),
            ('[ Vine ]', _VINE_RE, self.get_vine_pic),
        )
        for label, pattern, fetch_pic in fetchers:
            if pattern.search(src) is not None:
                iframe.replaceWith(
                    self._placeholder(soup, label, fetch_pic(src), src))
                return

        if _VISUALISE_RE.search(src) is not None:
            img_url = self.get_visualise_pic(src)
            if img_url is not None:
                iframe.replaceWith(Tag(soup, 'img', [('src', img_url)]))

    def _copy_text_nodes(self, nodes, target):
        """Append the text of each node in *nodes* to *target*.

        Nodes whose ``.string`` is None are skipped. Doubly-escaped
        entities are collapsed on the way. The substitution goes through
        the compiled pattern: the previous ``re.sub(pat, repl, text,
        flags)`` call passed the flags in the ``count`` position.
        """
        for node in list(nodes):
            text = node.string
            if text is None:
                continue
            target.insert(
                len(target.contents),
                _DOUBLE_ENTITY_RE.sub(_collapse_double_entity, str(text)))

    def _rebuild_twitter(self, soup, blockquote):
        """Replace a Twitter blockquote with a flat text placeholder."""
        box = self._placeholder(soup, '[ Twitter ]')
        found = _TWITTER_PIC_RE.search(str(blockquote))
        if found is not None:
            img = self.get_twitter_pic(str(found.group('url')))
            if img is not None:
                box.insert(len(box.contents), img)

        for para in blockquote.findAll(name='p'):
            # An empty <p> used to raise IndexError here.
            self._copy_text_nodes(para.contents, box)
            box.insert(len(box.contents), Tag(soup, 'br'))
            para.extract()
        # Whatever is left once the paragraphs have been pulled out.
        self._copy_text_nodes(blockquote.contents, box)
        blockquote.replaceWith(box)

    def _rebuild_instagram(self, soup, blockquote):
        """Replace an Instagram blockquote with a flat text placeholder."""
        link = blockquote.find(name='a', attrs={'href': _INSTAGRAM_RE})
        img_url = None
        if link is not None:
            img_url = self.get_instagram_pic(str(link['href']))
        box = self._placeholder(soup, '[ Instagram ]', img_src=img_url)

        for para in blockquote.findAll(name='p'):
            self._copy_text_nodes(para.contents, box)
            box.insert(len(box.contents), Tag(soup, 'br'))
        blockquote.replaceWith(box)

    def _fetch(self, url, shown_url=None):
        """Return the body at *url*, or None after printing '404: <url>'.

        *shown_url* overrides the URL printed in the failure message.
        """
        try:
            return self.browser.open(url).read()
        except Exception:
            print('404: ' + (url if shown_url is None else shown_url))
            return None

    def get_visualise_pic(self, url):
        """Return the page's ``og:image`` content, or None."""
        raw = self._fetch(url)
        if raw is None:
            return None
        meta = BeautifulSoup(raw).find(
            name='meta', attrs={'property': 'og:image'})
        if meta is None:
            return None
        return str(meta['content'])

    def get_twitter_pic(self, url):
        """Return the permalink ``<img>`` tag behind *url*, or None.

        *url* is given without a scheme; ``https://`` is prepended. If the
        page contains a meta refresh, its target is fetched instead.
        """
        raw = self._fetch('https://' + url, shown_url=url)
        if raw is None:
            return None
        page = BeautifulSoup(raw)

        refresh = page.find('meta', {'http-equiv': 'refresh'})
        if refresh is not None:
            # Target is whatever follows the first '=' in the content.
            target = (refresh.get('content') or '').partition('=')[2]
            raw = self._fetch(target, shown_url=url)
            if raw is None:
                return None
            page = BeautifulSoup(raw)

        return page.find(
            name='img', attrs={'alt': _ci('.*permalink.*')})

    def get_soundcloud_pic(self, url):
        """Always return None.

        The artwork is loaded via JavaScript and requires a login and/or
        a registered application identification, so it is not fetched.
        """
        return None

    def get_instagram_pic(self, url):
        """Return the ``display_src`` URL with backslashes removed, or None."""
        raw = self._fetch(url)
        if raw is None:
            return None
        found = _search_url(_INSTAGRAM_PIC_RE, raw)
        if found is None:
            return None
        return found.replace('\\', '')

    def get_dailymotion_pic(self, url):
        """Return the ``thumbnail_url`` with backslashes removed, or None."""
        raw = self._fetch(url)
        if raw is None:
            return None
        found = _search_url(_DAILYMOTION_PIC_RE, raw)
        if found is None:
            return None
        return found.replace('\\', '')

    def get_spotify_pic(self, url):
        """Return the value of the page's ``data-ca`` attribute, or None."""
        raw = self._fetch(url)
        if raw is None:
            return None
        return _search_url(_SPOTIFY_PIC_RE, raw)

    def get_vine_pic(self, url):
        """Return the first ``src`` following '"thumbnail', or None."""
        raw = self._fetch(url)
        if raw is None:
            return None
        return _search_url(_VINE_PIC_RE, raw)

    # NOTE(review): the source as received is damaged from here on. Both
    # patterns below arrived empty (so both substitutions are no-ops), and
    # a third entry ran straight into the tail of a triple-quoted
    # stylesheet. Only this fragment of that stylesheet survived; its
    # selector looks truncated, so it is preserved here rather than applied:
    #
    #     p:not(.date){ font-weight:bold; }
    #
    # Restore the patterns, the third entry and the stylesheet from the
    # upstream recipe.
    preprocess_regexps = [
        (re.compile(r'', re.DOTALL | re.IGNORECASE), lambda h1: ''),
        (re.compile(r'', re.IGNORECASE), lambda h2: ''),
    ]