import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import Tag


class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    title = u'New Musical Express Magazine'
    description = 'UK Rock & Pop Mag.'
    __author__ = 'Dave Asbury, Inge Aning'
    category = 'Music, Film, TV'
    publisher = 'Time Inc. (UK) Ltd.'
    '''
    ' updated 11/3/2015
    ' feeds url
    ' cover and masthead url
    ' fix for a bug that prevented some pages from rendering
    ' changes to website
    '''

    remove_empty_feeds = True
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 20
    auto_cleanup = False
    language = 'en'
    compress_news_images = True
    simultaneous_downloads = 20
    use_embedded_content = False
    recursions = 0

    conversion_options = {
        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
    }

    feeds = [
        (u'NME News', u'http://www.nme.com/rss/news'),
        (u'Reviews', u'http://www.nme.com/rss/reviews'),
        (u'Blogs', u'http://www.nme.com/rss/blogs'),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'id': 'content'}),
    ]

    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
                         'valign', 'vspace', 'hspace', 'alt', 'width', 'height']

    remove_tags = [
        dict(name='meta'),
        dict(name='span', attrs={'class': 'article_info'}),
        dict(name='div', attrs={'class': 'breadcrumbs'}),
        dict(name='div', attrs={'class': 'mugshot'}),
        dict(name='div', attrs={'class': 'header'}),
        dict(name='div', attrs={'class': re.compile('youtube.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile('socialbuttons.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': 'clear_both'}),
        dict(name='div', attrs={'class': re.compile('headline.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': 'member-signedout'}),
        dict(name='div', attrs={'class': re.compile('prev_next.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile('article_related.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile('feature_bar.*', re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile('ebay.*', re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile('morenews.*', re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile('ticketspopup.*', re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile('ratemy_logprompt.*', re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile('related_artist.*', re.IGNORECASE)}),
        dict(name='img', attrs={'class': re.compile('video_play_large.*', re.IGNORECASE)}),
        dict(name='ul', attrs={'class': re.compile('prev_next.*', re.IGNORECASE)}),
        dict(name='ul', attrs={'class': re.compile('nme_store.*', re.IGNORECASE)}),
        dict(name='p', attrs={'class': re.compile('top', re.IGNORECASE)}),
        dict(name='table', attrs={'class': re.compile('tickets.*', re.IGNORECASE)}),
    ]

    masthead_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'

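    # Fetch the magazine index page, strip scripts and the malformed conditional
    # comment, then read the cover image URL from the 'magcover' img; fall back
    # to the NME logo used as masthead_url if the cover cannot be opened.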
    def get_cover_url(self):
        magazine_page_raw = self.index_to_soup(
            'http://www.nme.com/magazine', raw=True)
        magazine_page_raw = re.sub(
            r'<script\b.+?</script>', '', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
        magazine_page_raw = re.sub(
            r'\!\[if ', '!--[if ', magazine_page_raw, flags=re.DOTALL | re.IGNORECASE)
        magazine_page = self.index_to_soup(magazine_page_raw)
        cov = magazine_page.find('img', attrs={'class': 'magcover'})

        cov2 = str(cov['src'])

        br = browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cov2)
            cover_url = str(cov2)
        except:
            cover_url = 'http://default.media.ipcdigital.co.uk/300/000001014/e1ab_orh100000w300/NME-logo.jpg'
        return cover_url

    def preprocess_raw_html(self, raw_html, url):
        '''
        Needed for a bug on the site that prevents blog posts from being parsed correctly
        '''
        raw_html = re.sub(r'\!\[if ', '!--[if ', raw_html,
                          flags=re.DOTALL | re.IGNORECASE)

        return raw_html

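    # Replace embedded media that cannot render in the e-book (YouTube, SoundCloud,
    # DailyMotion, Spotify, Vine and visualise iframes, plus Twitter and Instagram
    # blockquotes) with a labelled blockquote holding a thumbnail (where one can be
    # found) and the source URL, then unwrap <a> tags into their text or child element.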
    def preprocess_html(self, soup):
        youtube_regex = re.compile(
            r'(?:youtube\.com/(?:[^/]+/.+/|(?:v|e(?:mbed)?)/|.*[?&]v=)|youtu\.be/)(?P<id>[^"&?/ ]{11})', re.DOTALL | re.IGNORECASE)
        instagram_regex = re.compile(
            r'.*?instagram.*?', re.DOTALL | re.IGNORECASE)
        twitter_regex = re.compile(r'.*?twitter.*?', re.DOTALL | re.IGNORECASE)
        visualise_regex = re.compile(
            r'.*?visualise.*?', re.DOTALL | re.IGNORECASE)
        soundcloud_regex = re.compile(
            r'(?P<url>.*?(w|api)\.soundcloud.*?com\/(tracks|playlists)\/\d{8,9})', re.DOTALL | re.IGNORECASE)
        dailymotion_regex = re.compile(
            r'.*?dailymotion.*?', re.DOTALL | re.IGNORECASE)
        spotify_regex = re.compile(r'.*?spotify.*?', re.DOTALL | re.IGNORECASE)
        vine_regex = re.compile(r'.*?vine.*?', re.DOTALL | re.IGNORECASE)
        # matches double-encoded entities such as '&amp;nbsp;' so they can be
        # collapsed back to a single entity
        doubleHtmlEntities = re.compile(
            ur'(&amp;)(?P<e>[\d\w\#]*;)', re.DOTALL | re.IGNORECASE | re.UNICODE)

        for iframe in soup.findAll('iframe'):
            if iframe.has_key('src') and youtube_regex.search(iframe['src']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ YouTube ]')
                pq.insert(1, br)
                m = youtube_regex.search(iframe['src'])
                if m.group('id') is not None:
                    imgTag = Tag(soup, 'img', [
                        ('src', 'http://img.youtube.com/vi/' + m.group('id') + '/0.jpg')])
                    pq.insert(len(pq.contents), imgTag)
                pq.insert(len(pq.contents), iframe['src'])
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and soundcloud_regex.search(iframe['src']) is not None:  # noqa
                m = soundcloud_regex.search(iframe['src'])
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ SoundCloud ]')
                pq.insert(1, br)
                pq.insert(2, m.group('url'))
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and dailymotion_regex.search(iframe['src']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ DailyMotion ]')
                pq.insert(1, br)
                imgUrl = self.get_dailymotion_pic(iframe['src'])
                if imgUrl is not None:
                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
                    pq.insert(len(pq.contents), imgTag)
                pq.insert(len(pq.contents), iframe['src'])
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and spotify_regex.search(iframe['src']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ Spotify ]')
                pq.insert(1, br)
                imgUrl = self.get_spotify_pic(iframe['src'])
                if imgUrl is not None:
                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
                    pq.insert(len(pq.contents), imgTag)
                pq.insert(len(pq.contents), iframe['src'])
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and vine_regex.search(iframe['src']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ Vine ]')
                pq.insert(1, br)
                imgUrl = self.get_vine_pic(iframe['src'])
                if imgUrl is not None:
                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
                    pq.insert(len(pq.contents), imgTag)
                pq.insert(len(pq.contents), iframe['src'])
                iframe.replaceWith(pq)
            elif iframe.has_key('src') and visualise_regex.search(iframe['src']) is not None:  # noqa
                imgUrl = self.get_visualise_pic(iframe['src'])
                if imgUrl is not None:
                    imgTag = Tag(soup, 'img', [('src', imgUrl)])
                    iframe.replaceWith(imgTag)

        for blockquote in soup.findAll('blockquote'):
            if blockquote.has_key('class') and twitter_regex.search(blockquote['class']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ Twitter ]')
                pq.insert(len(pq.contents), br)
                match = re.search(
                    "(?P<url>pic\.twitter[^\s<]+)", str(blockquote))
                if match is not None:
                    img = self.get_twitter_pic(str(match.group("url")))
                    if img is not None:
                        pq.insert(len(pq.contents), img)
                for p in blockquote.findAll(name='p'):
                    x = 0
                    plen = len(p.contents)
                    while True:
                        c = len(pq.contents)
                        if p.contents[x].string is not None:
                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2),
                                                str(p.contents[x].string)))
                        else:
                            pq.insert(c, p.contents[x].content)
                        x += 1
                        if x == plen:
                            break
                    br = Tag(soup, 'br')
                    pq.insert(len(pq.contents), br)
                    p.extract()
                if len(blockquote.contents) > 0:
                    x = 0
                    xlen = len(blockquote.contents)
                    while True:
                        c = len(pq.contents)
                        if blockquote.contents[x].string is not None:
                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2),
                                                str(blockquote.contents[x].string)))
                        else:
                            pq.insert(c, blockquote.contents[x].content)
                        x += 1
                        if x == xlen:
                            break
                blockquote.replaceWith(pq)
            elif blockquote.has_key('class') and instagram_regex.search(blockquote['class']) is not None:  # noqa
                pq = Tag(soup, 'blockquote')
                br = Tag(soup, 'br')
                pq.insert(0, '[ Instagram ]')
                pq.insert(1, br)
                a = blockquote.find(name='a', attrs={'href': instagram_regex})
                imgUrl = None
                if a is not None:
                    imgUrl = self.get_instagram_pic(str(a['href']))
                if imgUrl is not None:
                    img = Tag(soup, 'img', [('src', imgUrl)])
                    pq.insert(len(pq.contents), img)
                for p in blockquote.findAll(name='p'):
                    x = 0
                    plen = len(p.contents)
                    while x < plen:
                        c = len(pq.contents)
                        if p.contents[x].string is not None:
                            pq.insert(c, re.sub(doubleHtmlEntities, lambda m: '&' + m.group(2),
                                                str(p.contents[x].string)))
                        # else:
                        #     pq.insert(c, p.contents[x].content)
                        x += 1
                    br = Tag(soup, 'br')
                    c = len(pq.contents)
                    pq.insert(c, br)
                blockquote.replaceWith(pq)

        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
            elif alink.img is not None:
                tstr = alink.img
                alink.replaceWith(tstr)
            elif alink.span is not None:
                tstr = alink.span
                alink.replaceWith(tstr)
        return soup

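    # Return the og:image URL from a visualise embed page, or None on failure.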
    def get_visualise_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        bs = BeautifulSoup(raw)
        imgRaw = bs.find(name='meta', attrs={'property': 'og:image'})
        if imgRaw is not None:
            returnValue = str(imgRaw['content'])
        return returnValue

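    # Resolve a pic.twitter.com link, following any meta-refresh redirect, and
    # return the permalink <img> tag, or None on failure.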
    def get_twitter_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open('https://' + url).read()
        except:
            print '404: ' + url
            return returnValue
        bs = BeautifulSoup(raw)
        refresh = bs.find('meta', {'http-equiv': 'refresh'})
        if refresh is not None:
            content = refresh.get('content').partition('=')[2]
            try:
                raw = self.browser.open(content).read()
            except:
                print '404: ' + url
                return returnValue
            bs = BeautifulSoup(raw)
        img = bs.find(name='img', attrs={
            'alt': re.compile('.*permalink.*', re.IGNORECASE)})
        if img is not None:
            returnValue = img
        return returnValue

    def get_soundcloud_pic(self, url):
        # content is loaded via javascript and requires a login and/or a registered application id
        # returnValue = None
        # raw = self.browser.open(soundcloudUrl + '&visual=true').read()
        # bs = BeautifulSoup(raw)
        # imgRaw = bs.find(name='div', attrs={'style': re.compile(r'background-image:*?', re.IGNORECASE)})
        # if imgRaw is not None:
        #     returnValue = str(imgRaw['style'])
        return None  # returnValue

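    # Extract the display_src image URL from an Instagram post page, or None on failure.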
    def get_instagram_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        m = re.search('\"display_src\":\"(?P<url>http[s]?:.*?)\"', str(raw))
        if m is not None:
            returnValue = re.sub(
                r'\\', '', m.group("url"), flags=re.DOTALL | re.IGNORECASE)
        return returnValue

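    # Extract the thumbnail_url from a DailyMotion embed page, or None on failure.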
    def get_dailymotion_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        m = re.search('("thumbnail_url\"\:\")(?P<url>http.*?)(\")', str(raw))
        if m is not None:
            returnValue = re.sub(
                r'\\', '', m.group("url"), flags=re.DOTALL | re.IGNORECASE)
        return returnValue

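    # Extract the cover-art URL (data-ca attribute) from a Spotify embed page, or None on failure.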
    def get_spotify_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        m = re.search('data-ca=\"(?P<url>.*?)\"', str(raw))
        if m is not None:
            returnValue = m.group("url")
        return returnValue

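    # Extract the thumbnail image URL from a Vine embed page, or None on failure.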
    def get_vine_pic(self, url):
        returnValue = None
        try:
            raw = self.browser.open(url).read()
        except:
            print '404: ' + url
            return returnValue
        m = re.search('"thumbnail.*?src=\"(?P<url>.*?)\"', str(raw))
        if m is not None:
            returnValue = m.group("url")
        return returnValue

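    # Strip scripts, 'buy tickets' buttons and gallery links from the raw HTML
    # before it is parsed.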
    preprocess_regexps = [
        (re.compile(r'<script\b.+?</script>', re.DOTALL | re.IGNORECASE), lambda h1: ''),
        (re.compile(r'<a.* id="buy-tickets-button".*</a>', re.IGNORECASE), lambda h2: ''),
        (re.compile(r'<a.* class="gallery.*</a>', re.IGNORECASE), lambda h2: ''),
    ]

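    # Styling applied to the converted articles.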
    extra_css = '''
        h1, h2 {
            font-family:Arial,Helvetica,sans-serif;
            font-weight:bold;
            font-size:large;
        }
        h3 {
            font-family:Arial,Helvetica,sans-serif;
            font-weight:normal;
            font-size:small;
            font-style:italic;
            display:inline;
        }
        body {
            font-family:Helvetica,Arial,sans-serif;
            font-size:small;
        }
        blockquote {
            font-family:"Courier New", Courier, monospace;
            font-size:90%;
        }
        img {
            display:block;
        }
        .date {
            font-style:italic;
            font-weight:normal;
        }
        .article_header>p:not(.date) {
            font-weight:bold;
        }
    '''