mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00

Update New Musical Express

This commit is contained in:
parent aafeb58d58
commit 6f986d0488

@@ -1,25 +1,91 @@
 import re

 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre import browser
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.BeautifulSoup import Tag


 class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     title = u'New Musical Express Magazine'
-    description = 'Author D.Asbury. UK Rock & Pop Mag. '
-    __author__ = 'Dave Asbury'
-    # last updated 17/5/13 News feed url altered
+    description = 'UK Rock & Pop Mag.'
+    __author__ = 'Dave Asbury, Inge Aning'
+    category = 'Music, Film, Tv'
+    publisher = 'Time Inc. (UK) Ltd.'
+    '''
+    ' updated 11/3/2015
+    ' feeds url
+    ' cover and masthead url
+    ' fix for a bug that prevents some pages from rendering
+    ' changes to website
+    '''

     remove_empty_feeds = True
-    remove_javascript = True
+    encoding = 'utf-8'
+    remove_javascript = True
     no_stylesheets = True
     oldest_article = 7
     max_articles_per_feed = 20
-    #auto_cleanup = True
-    language = 'en_GB'
+    auto_cleanup = False
+    language = 'en'
+    compress_news_images = True
+    simultaneous_downloads = 20
+    use_embedded_content = False
+    recursions = 0
+
+    conversion_options = {
+        'comment': description,
+        'tags': category,
+        'publisher': publisher,
+        'language': language
+    }
+
+    feeds = [
+        (u'NME News',u'http://www.nme.com/rss/news'),
+        (u'Reviews',u'http://www.nme.com/rss/reviews'),
+        (u'Blogs',u'http://www.nme.com/rss/blogs'),
+    ]
+
+    keep_only_tags = [
+        dict(name='div',attrs={'id':'content'}),
+    ]
+
+    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
+                         'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
+
+    remove_tags = [
+        dict(name='meta'),
+        dict(name='span',attrs={'class':'article_info'}),
+        dict(name='div',attrs={'class':'breadcrumbs'}),
+        dict(name='div',attrs={'class':'mugshot'}),
+        dict(name='div',attrs={'class':'header'}),
+        dict(name='div',attrs={'class':re.compile('youtube.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'class':re.compile('socialbuttons.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'class':'clear_both'}),
+        dict(name='div',attrs={'class':re.compile('headline.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'class':'member-signedout'}),
+        dict(name='div',attrs={'class':re.compile('prev_next.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'class':re.compile('article_related.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'class':re.compile('feature_bar.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'class':re.compile('ebay.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'id':re.compile('morenews.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'id':re.compile('ticketspopup.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'id':re.compile('ratemy_logprompt.*',re.IGNORECASE)}),
+        dict(name='div',attrs={'id':re.compile('related_artist.*',re.IGNORECASE)}),
+        dict(name='img',attrs={'class':re.compile('video_play_large.*',re.IGNORECASE)}),
+        dict(name='ul',attrs={'class':re.compile('prev_next.*',re.IGNORECASE)}),
+        dict(name='ul',attrs={'class':re.compile('nme_store.*',re.IGNORECASE)}),
+        dict(name='p',attrs={'class':re.compile('top',re.IGNORECASE)}),
+        dict(name='table',attrs={'class':re.compile('tickets.*',re.IGNORECASE)}),
+    ]
+
+    masthead_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'

     def get_cover_url(self):
-        soup = self.index_to_soup('http://www.nme.com/component/subscribe')
-        cov = soup.find(attrs={'id' : 'magazine_cover'})
+        magazine_page_raw = self.index_to_soup('http://www.nme.com/magazine', raw=True)
+        magazine_page_raw = re.sub(r'<script\b.+?</script>', '', magazine_page_raw, flags=re.DOTALL|re.IGNORECASE)
+        magazine_page_raw = re.sub(r'\!\[if ', '!--[if ', magazine_page_raw, flags=re.DOTALL|re.IGNORECASE)
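+        # same '<![if ' -> '<!--[if ' repair that preprocess_raw_html() below applies to article pages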
+        magazine_page = self.index_to_soup(magazine_page_raw)
+        cov = magazine_page.find('img',attrs={'class':'magcover'})

         cov2 = str(cov['src'])
         # print '**** Cov url =*', cover_url,'***'
         # print '**** Cov url =*','http://www.magazinesdirect.com/article_images/articledir_3138/1569221/1_largelisting.jpg','***'

         br = browser()
         br.set_handle_redirect(False)
@@ -27,43 +93,291 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
             br.open_novisit(cov2)
             cover_url = str(cov2)
         except:
-            cover_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
+            cover_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
         return cover_url

-    masthead_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
-    remove_tags = [
-        dict(attrs={'class':'clear_icons'}),
-        dict(attrs={'class':'share_links'}),
-        dict(attrs={'id':'right_panel'}),
-        dict(attrs={'class':'today box'}),
+    def preprocess_raw_html(self, raw_html, url):
+        '''
+        Needed to work around a site bug that prevents some blog posts from being parsed correctly
+        '''
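+        # e.g. the page source contains '<![if !IE]>' where '<!--[if !IE]>' is intended;
+        # restoring the '--' turns it back into a conditional comment the parser can handle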
+        raw_html = re.sub(r'\!\[if ', '!--[if ', raw_html, flags=re.DOTALL|re.IGNORECASE)
+
+        return raw_html
+
-    ]
+    def preprocess_html(self, soup):
+        youtube_regex = re.compile(r'(?:youtube\.com/(?:[^/]+/.+/|(?:v|e(?:mbed)?)/|.*[?&]v=)|youtu\.be/)(?P<id>[^"&?/ ]{11})', re.DOTALL|re.IGNORECASE)
+        instagram_regex = re.compile(r'.*?instagram.*?', re.DOTALL|re.IGNORECASE)
+        twitter_regex = re.compile(r'.*?twitter.*?', re.DOTALL|re.IGNORECASE)
+        visualise_regex = re.compile(r'.*?visualise.*?', re.DOTALL|re.IGNORECASE)
+        soundcloud_regex = re.compile(r'(?P<url>.*?(w|api)\.soundcloud.*?com\/(tracks|playlists)\/\d{8,9})', re.DOTALL|re.IGNORECASE)
+        dailymotion_regex = re.compile(r'.*?dailymotion.*?', re.DOTALL|re.IGNORECASE)
+        spotify_regex = re.compile(r'.*?spotify.*?', re.DOTALL|re.IGNORECASE)
+        vine_regex = re.compile(r'.*?vine.*?', re.DOTALL|re.IGNORECASE)
+        doubleHtmlEntities = re.compile(ur'(&amp;)(?P<e>[\d\w\#]*;)', re.DOTALL|re.IGNORECASE|re.UNICODE)
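+        # doubleHtmlEntities collapses double-escaped entities, e.g. '&amp;#8217;' -> '&#8217;'
+        # each recognised embed <iframe> below is swapped for a plain <blockquote> carrying a
+        # label, a thumbnail where one can be scraped, and the embed URL; illustratively,
+        #   <iframe src="https://www.youtube.com/embed/XXXXXXXXXXX"></iframe>
+        # becomes '[ YouTube ]', the video's /0.jpg still, then the URL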
+        for iframe in soup.findAll('iframe'):
+            if iframe.has_key('src') and youtube_regex.search(iframe['src']) is not None:
+                pq = Tag(soup, 'blockquote')
+                br = Tag(soup, 'br')
+                pq.insert(0, '[ YouTube ]')
+                pq.insert(1, br)
+                m = youtube_regex.search(iframe['src'])
+                if m.group('id') is not None:
+                    imgTag = Tag(soup, 'img', [('src','http://img.youtube.com/vi/' + m.group('id') + '/0.jpg')])
+                    pq.insert(len(pq.contents), imgTag)
+                pq.insert(len(pq.contents), iframe['src'])
+                iframe.replaceWith(pq)
+            elif iframe.has_key('src') and soundcloud_regex.search(iframe['src']) is not None:
+                m = soundcloud_regex.search(iframe['src'])
+                pq = Tag(soup, 'blockquote')
+                br = Tag(soup, 'br')
+                pq.insert(0, '[ SoundCloud ]')
+                pq.insert(1, br)
+                pq.insert(2, m.group('url'))
+                #imgUrl = self.get_soundcloud_pic(iframe['src'])
+                iframe.replaceWith(pq)
+            elif iframe.has_key('src') and dailymotion_regex.search(iframe['src']) is not None:
+                pq = Tag(soup, 'blockquote')
+                br = Tag(soup, 'br')
+                pq.insert(0, '[ DailyMotion ]')
+                pq.insert(1, br)
+                imgUrl = self.get_dailymotion_pic(iframe['src'])
+                if imgUrl is not None:
+                    imgTag = Tag(soup, 'img', [('src',imgUrl)])
+                    pq.insert(len(pq.contents), imgTag)
+                pq.insert(len(pq.contents), iframe['src'])
+                iframe.replaceWith(pq)
+            elif iframe.has_key('src') and spotify_regex.search(iframe['src']) is not None:
+                pq = Tag(soup, 'blockquote')
+                br = Tag(soup, 'br')
+                pq.insert(0, '[ Spotify ]')
+                pq.insert(1, br)
+                imgUrl = self.get_spotify_pic(iframe['src'])
+                if imgUrl is not None:
+                    imgTag = Tag(soup, 'img', [('src',imgUrl)])
+                    pq.insert(len(pq.contents), imgTag)
+                pq.insert(len(pq.contents), iframe['src'])
+                iframe.replaceWith(pq)
+            elif iframe.has_key('src') and vine_regex.search(iframe['src']) is not None:
+                pq = Tag(soup, 'blockquote')
+                br = Tag(soup, 'br')
+                pq.insert(0, '[ Vine ]')
+                pq.insert(1, br)
+                imgUrl = self.get_vine_pic(iframe['src'])
+                if imgUrl is not None:
+                    imgTag = Tag(soup, 'img', [('src',imgUrl)])
+                    pq.insert(len(pq.contents), imgTag)
+                pq.insert(len(pq.contents), iframe['src'])
+                iframe.replaceWith(pq)
+            elif iframe.has_key('src') and visualise_regex.search(iframe['src']) is not None:
+                imgUrl = self.get_visualise_pic(iframe['src'])
+                if imgUrl is not None:
+                    imgTag = Tag(soup, 'img', [('src',imgUrl)])
+                    iframe.replaceWith(imgTag)
+        for blockquote in soup.findAll('blockquote'):
+            if blockquote.has_key('class') and twitter_regex.search(blockquote['class']) is not None:
+                pq = Tag(soup, 'blockquote')
+                br = Tag(soup, 'br')
+                pq.insert(0, '[ Twitter ]')
+                pq.insert(len(pq.contents), br)
+                match = re.search("(?P<url>pic\.twitter[^\s<]+)", str(blockquote))
+                if match is not None:
+                    img = self.get_twitter_pic(str(match.group("url")))
+                    if img is not None:
+                        pq.insert(len(pq.contents),img)
+                for p in blockquote.findAll(name='p'):
+                    x = 0
+                    plen = len(p.contents)
+                    while True:
+                        c = len(pq.contents)
+                        if p.contents[x].string is not None:
+                            # flags already live on the compiled pattern; a fourth positional
+                            # argument to re.sub would be read as a replacement count
+                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(p.contents[x].string)))
+                        else:
+                            pq.insert(c, p.contents[x].content)
+                        x += 1
+                        if x == plen:
+                            break
+                    br = Tag(soup, 'br')
+                    pq.insert(len(pq.contents), br)
+                    p.extract()
+                if len(blockquote.contents) > 0:
+                    x = 0
+                    xlen = len(blockquote.contents)
+                    while True:
+                        c = len(pq.contents)
+                        if blockquote.contents[x].string is not None:
+                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(blockquote.contents[x].string)))
+                        else:
+                            pq.insert(c, blockquote.contents[x].content)
+                        x += 1
+                        if x == xlen:
+                            break
+                blockquote.replaceWith(pq)
+            elif blockquote.has_key('class') and instagram_regex.search(blockquote['class']) is not None:
+                pq = Tag(soup, 'blockquote')
+                br = Tag(soup, 'br')
+                pq.insert(0, '[ Instagram ]')
+                pq.insert(1, br)
+                a = blockquote.find(name='a',attrs={'href':instagram_regex})
+                imgUrl = None
+                if a is not None:
+                    imgUrl = self.get_instagram_pic(str(a['href']))
+                if imgUrl is not None:
+                    img = Tag(soup, 'img', [('src',imgUrl)])
+                    pq.insert(len(pq.contents),img)
+                for p in blockquote.findAll(name='p'):
+                    x = 0
+                    plen = len(p.contents)
+                    while x < plen:
+                        c = len(pq.contents)
+                        if p.contents[x].string is not None:
+                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2), str(p.contents[x].string)))
+                        # else:
+                        #     pq.insert(c, p.contents[x].content)
+                        x += 1
+                    br = Tag(soup, 'br')
+                    c = len(pq.contents)
+                    pq.insert(c, br)
+                blockquote.replaceWith(pq)
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
+            elif alink.img is not None:
+                tstr = alink.img
+                alink.replaceWith(tstr)
+            elif alink.span is not None:
+                tstr = alink.span
+                alink.replaceWith(tstr)
+        return soup

-    keep_only_tags = [
+    def get_visualise_pic(self, url):
+        returnValue = None
+        try:
+            raw = self.browser.open(url).read()
+        except:
+            print '404: ' + url
+            return returnValue
+        bs = BeautifulSoup(raw)
+        imgRaw = bs.find(name='meta', attrs={'property':'og:image'})
+        if imgRaw is not None:
+            returnValue = str(imgRaw['content'])
+        return returnValue

-        dict(name='h1'),
-        #dict(name='h3'),
-        dict(attrs={'class' : 'BText'}),
-        dict(attrs={'class' : 'Bmore'}),
-        dict(attrs={'class' : 'bPosts'}),
-        dict(attrs={'class' : 'text'}),
-        dict(attrs={'id' : 'article_gallery'}),
-        #dict(attrs={'class' : 'image'}),
-        dict(attrs={'class' : 'article_text'})
-    ]
+    def get_twitter_pic(self, url):
+        returnValue = None
+        try:
+            raw = self.browser.open('https://' + url).read()
+        except:
+            print '404: ' + url
+            return returnValue
+        bs = BeautifulSoup(raw)
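+        # pic.twitter.com may answer with an HTML <meta http-equiv="refresh"> stub
+        # instead of an HTTP redirect, so the refresh target is followed by hand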
+        refresh = bs.find('meta', {'http-equiv':'refresh'})
+        if refresh is not None:
+            content = refresh.get('content').partition('=')[2]
+            try:
+                raw = self.browser.open(content).read()
+            except:
+                print '404: ' + url
+                return returnValue
+            bs = BeautifulSoup(raw)
+        img = bs.find(name='img',attrs={'alt':re.compile('.*permalink.*',re.IGNORECASE)})
+        if img is not None:
+            returnValue = img
+        return returnValue

+    def get_soundcloud_pic(self, url):
+        # content is loaded via JavaScript and requires a login and/or a registered
+        # application identification, so no thumbnail is fetched for now
+        # returnValue = None
+        # raw = self.browser.open(soundcloudUrl + '&visual=true').read()
+        # bs = BeautifulSoup(raw)
+        # imgRaw = bs.find(name='div', attrs={'style':re.compile(r'background-image:*?',re.IGNORECASE)})
+        # if imgRaw is not None:
+        #     returnValue = str(imgRaw['style'])
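+        # possible alternative (an assumption, untried here): SoundCloud's public
+        # oEmbed endpoint, e.g. http://soundcloud.com/oembed?format=json&url=<track-url>,
+        # returns JSON that includes a thumbnail_url field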
+        return None  # returnValue

+    def get_instagram_pic(self, url):
+        returnValue = None
+        try:
+            raw = self.browser.open(url).read()
+        except:
+            print '404: ' + url
+            return returnValue
+        m = re.search('\"display_src\":\"(?P<url>http[s]?:.*?)\"', str(raw))
+        if m is not None:
+            returnValue = re.sub(r'\\','',m.group("url"), flags=re.DOTALL|re.IGNORECASE)
+        return returnValue

+    def get_dailymotion_pic(self, url):
+        returnValue = None
+        try:
+            raw = self.browser.open(url).read()
+        except:
+            print '404: ' + url
+            return returnValue
+        m = re.search('("thumbnail_url\"\:\")(?P<url>http.*?)(\")', str(raw))
+        if m is not None:
+            returnValue = re.sub(r'\\','',m.group("url"), flags=re.DOTALL|re.IGNORECASE)
+        return returnValue

+    def get_spotify_pic(self, url):
+        returnValue = None
+        try:
+            raw = self.browser.open(url).read()
+        except:
+            print '404: ' + url
+            return returnValue
+        m = re.search('data-ca=\"(?P<url>.*?)\"', str(raw))
+        if m is not None:
+            returnValue = m.group("url")
+        return returnValue

+    def get_vine_pic(self, url):
+        returnValue = None
+        try:
+            raw = self.browser.open(url).read()
+        except:
+            print '404: ' + url
+            return returnValue
+        m = re.search('"thumbnail.*?src=\"(?P<url>.*?)\"', str(raw))
+        if m is not None:
+            returnValue = m.group("url")
+        return returnValue

+    preprocess_regexps = [
+        (re.compile(r'<script\b.+?</script>', re.DOTALL|re.IGNORECASE), lambda h1: ''),
+        (re.compile(r'<a.* id="buy-tickets-button".*</a>',re.IGNORECASE), lambda h2: ''),
+        (re.compile(r'<a.* class="gallery.*</a>',re.IGNORECASE), lambda h2: ''),
+    ]

-    feeds = [
-        (u'NME News', u'http://www.nme.com/news?alt=rss' ), #http://feeds.feedburner.com/nmecom/rss/newsxml?format=xml'),
-        #(u'Reviews', u'http://feeds2.feedburner.com/nme/SdML'),
-        (u'Reviews',u'http://feed43.com/1817687144061333.xml'),
-        (u'Bloggs',u'http://feed43.com/3326754333186048.xml'),
-    ]
     extra_css = '''
-        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
-        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
-        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
-    '''
+        h1, h2 {
+            font-family:Arial,Helvetica,sans-serif;
+            font-weight:bold;
+            font-size:large;
+        }
+        h3 {
+            font-family:Arial,Helvetica,sans-serif;
+            font-weight:normal;
+            font-size:small;
+            font-style:italic;
+            display:inline;
+        }
+        body {
+            font-family:Helvetica,Arial,sans-serif;
+            font-size:small;
+        }
+        blockquote {
+            font-family:"Courier New", Courier, monospace;
+            font-size:90%;
+        }
+        img {
+            display:block;
+        }
+        .date {
+            font-style:italic;
+            font-weight:normal;
+        }
+        .article_header>p:not(.date) {
+            font-weight:bold;
+        }
+    '''