calibre/recipes/metro_news_nl.recipe

# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image
from BeautifulSoup import BeautifulSoup

''' Version 1.2, updated cover image to match the changed website.
 added info date on title
 version 1.4 Updated tags, delay and added autoclean 22-09-2011
 version 1.5 Changes due to changes in site
 version 1.6 Added css, removed auto cleanup, added buitenland section, added use_embedded_content, added remove_attributes
    Added some processing on pictures
    Removed links in html
    Removed extra white characters
    changed handling of self closing span
 Version 1.7 11-11-2011 Changed oldest_article back to 1.5
    changed è into &egrave;
    updated remove tags
    removed keep_only tags
 Version 1.8 26-11-2011
   added remove tag: article-slideshow
 Version 1.9 31-1-2012
   removed some left debug settings
      extended timeout from 2 to 10
      changed oldest article from 10 to 1.2
      changed max articles from 15 to 25
 Version 1.9.1 18-04-2012
    removed some debug settings
    updated code to match new metro-layout
 Version 1.9.2 24-04-2012
    updated code to match new metro-layout
 Version 1.9.3 25-04-2012
    Changed a lot of custom code into calibre code as the default code of calibre has become much faster since the first version of this recipe
    Added new feeds
    Updated css
    Changed order of regex to speed up process
 Version 1.9.3 23-05-2012
    Updated Cover image
 Version 1.9.4 19-04-2013
    Added regex filter for mailto
    Updated for new layout of metro-site
 Version 1.9.5 28-05-2013
    Added some extra id's and classes to remove
'''

class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    """Recipe for the Dutch edition of the newspaper Metro (metronieuws.nl).

    The class is almost entirely declarative: it sets download limits,
    styling, tag/attribute filters and the list of RSS feeds.  The two
    hooks ``preprocess_html`` and ``postprocess_html`` delegate the actual
    tree manipulation to ``MerryProcess``.
    """

    title = u'Metro Nieuws NL'
    oldest_article = 1.2
    max_articles_per_feed = 25
    __author__  = u'DrMerry'
    description = u'Metro Nederland v1.9.5 2013-05-28, Download nieuws van de Nederlandse editie van de krant Metro'
    language = u'nl'
    simultaneous_downloads = 5
    masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
    timeout = 10
    center_navbar = True
    timefmt = ' [%A, %d %b %Y]'
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    cover_url = 'http://www.readmetro.com/en/holland/metro-holland/image/large/last/'
    publication_type = 'newspaper'
    encoding = 'utf-8'
    remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope']#, 'href']
    use_embedded_content = False
    extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar,.calibre5{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact{font-size:0.7em}.article-box-fact.module-title{margin:8px 0; font-size:0.8em}h2{font-size:1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2,.calibre5,.calibrenavbar{border:0;padding:0}.column1,h1,h2,.calibrenavbar{margin:0}'


    # Each match (an "&nbsp;" entity, a single whitespace character, or an
    # <img> tag pointing at a metronieuws.nl template/spacer image) is
    # replaced by one space.
    preprocess_regexps = [
        (re.compile(r'(&nbsp;|\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)', re.DOTALL|re.IGNORECASE),lambda match: ' '),
        #(re.compile(r'(&nbsp;|\s)+', re.DOTALL|re.IGNORECASE),lambda match:' '),
        #(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'')
        #(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em')
        ]

    remove_tags_before= dict(id='subwrapper')
    remove_tags_after = dict(name='div', attrs={'class':['body-area','article-main-area']})
    #name='div', attrs={'class':['subwrapper']})]
    #'column-1-3','gallery-text']})]#id='share-and-byline')]

    filter_regexps = [r'mailto:.*']

    # Regex literals below are raw strings so that "\d" and "\s" reach the
    # regex engine unchanged and are not treated as (invalid) string escapes.
    remove_tags = [
        dict(name=['iframe','script','noscript','style']),
        dict(name='div', attrs={'class':['fact-related-box','aside clearfix','aside clearfix middle-col-line','comments','share-tools','article-right-column','column-4-5','column-1-5','ad-msg','col-179 ','col-373 ','clear','ad','navigation',re.compile('share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)'),'promos','header-links','promo-2']}),
        dict(id=['super-carousel','article-2','googleads','column-1-5-bottom','column-4-5',re.compile(r'^ad(\d+|adcomp.*?)?$'),'adadcomp-4','margin-5','sidebar',re.compile(r'^article-\d'),'comments','gallery-1','sharez_container','ts-container','topshares','ts-title']),
        dict(name='a', attrs={'name':'comments'}),
        #dict(name='div', attrs={'data-href'}),
        dict(name='img', attrs={'class':'top-line','title':'volledig scherm'}),
        dict(attrs={'style':re.compile(r'^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$'),'title':'volledig scherm'})]

    # Already removed by remove_tags_before / remove_tags_after:
    #   id:
    #     column-1-5-top, 'hidden_div', 'footer'
    #   class:
    #     'header', re.compile('^footer-[a-zA-Z0-9]+$'), 'header-links'

    def preprocess_html(self, soup):
        """Rearrange title/byline and strip unwanted tags from ``soup``.

        Delegates to ``MerryProcess.moveTitleAndAuthor`` and
        ``MerryProcess.removeUnwantedTags`` and returns the same ``soup``.
        """
        myProcess = MerryProcess()
        myProcess.moveTitleAndAuthor(soup)
        myProcess.removeUnwantedTags(soup)
        return soup

    def postprocess_html(self, soup, first):
        """Run ``MerryProcess.optimizeLayout`` on ``soup`` and return it.

        ``first`` is accepted to match the hook signature but is not used.
        """
        myProcess = MerryProcess()
        myProcess.optimizeLayout(soup)
        return soup

    # (section title, RSS url) pairs
    feeds = [
        (u'Binnenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-3'),
        (u'Economie', u'http://www.metronieuws.nl/rss.xml?c=1278070988-0'),
        (u'Den Haag', u'http://www.metronieuws.nl/rss.xml?c=1289013337-3'),
        (u'Rotterdam', u'http://www.metronieuws.nl/rss.xml?c=1289013337-2'),
        (u'Amsterdam', u'http://www.metronieuws.nl/rss.xml?c=1289013337-1'),
        (u'Buitenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-4'),
        (u'Columns', u'http://www.metronieuws.nl/rss.xml?c=1277377288-17'),
        (u'Entertainment', u'http://www.metronieuws.nl/rss.xml?c=1277377288-2'),
        (u'Strips',u'http://www.metronieuws.nl/rss.xml?c=1325037714-0'),
        (u'Tech', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
        (u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
        (u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
        (u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
        (u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
        (u'Wetenschap',u'http://www.metronieuws.nl/rss.xml?c=1303088437-0'),
        (u'Planeet',u'http://www.metronieuws.nl/rss.xml?c=1277377288-14'),
        (u'Gezondheid',u'http://www.metronieuws.nl/rss.xml?c=1277377288-15'),
        (u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
        ]

class MerryPreProcess():
    """Image clean-up helper used after the article HTML has been built."""

    def optimizePicture(self,soup):
        """Trim every ``<img>`` that has a ``src`` attribute, in place.

        For each matching tag the ``src`` value is opened with ``Image``,
        trimmed with ``trim(0)`` and saved back to the same location.
        A failure on one image is reported on stdout and does not stop the
        remaining images from being processed.

        Returns the ``soup`` that was passed in.
        """
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            try:
                iurl = tag['src']
                img = Image()
                img.open(iurl)
                img.trim(0)
                img.save(iurl)
            except Exception:
                # Best effort: keep going, but let KeyboardInterrupt and
                # SystemExit propagate (a bare except would swallow them).
                print('\n!!image optimize failed!!\n')
        return soup

class MerryExtract():
    """Helper that detaches parsed nodes from their tree without raising."""

    def safeRemovePart(self, killingSoup, soupIsArray):
        """Call ``extract()`` on a node, or on every element of a collection.

        killingSoup -- the node to extract, or an iterable of nodes.
        soupIsArray -- truthy to extract each element of ``killingSoup``,
                       falsy to extract ``killingSoup`` itself.

        Returns ``killingSoup`` on success.  Returns ``False`` when
        ``killingSoup`` is falsy (``None``, empty list, ...) or when any
        extraction raises an exception.
        """
        if not killingSoup:
            return False
        try:
            if soupIsArray:
                # Iterate over a snapshot: extract() may remove the element
                # from the very container being iterated, which would make
                # a live iteration skip every other element.
                for killer in list(killingSoup):
                    killer.extract()
            else:
                killingSoup.extract()
        except Exception:
            return False
        return killingSoup

class MerryProcess(BeautifulSoup):
    """Tree manipulation helpers for the Metro recipe.

    Every method works on the ``soup`` argument it is given and returns that
    same object.
    """

    # Helper instances shared by all MerryProcess objects.
    myKiller = MerryExtract()
    myPrepare = MerryPreProcess()

    # Text that counts as "empty": nothing but whitespace and/or literal
    # "&nbsp;" entities.
    _emptyText = re.compile(r'^(?:&nbsp;|\s)*$', re.UNICODE)

    def optimizeLayout(self,soup):
        """Run the image optimisation step on ``soup`` and return it."""
        self.myPrepare.optimizePicture(soup)
        return soup

    def insertFacts(self, soup):
        """Move the container of the "article-box-fact" divs into the body.

        The parent of the first div whose class starts with
        ``article-box-fact`` is looked up; each of its children that is not
        one of those fact divs is handed to ``safeRemovePart(child, True)``.
        If a div with class ``article-box-fact column`` and a div with class
        ``article-body`` both exist, that parent is inserted at position 0
        of the article body.  Returns ``soup``.
        """
        thefactpart = re.compile('^article-box-fact.*$')
        allfacts = soup.findAll('div', {'class':thefactpart})
        allfactsparent = None
        if allfacts:
            allfactsparent = allfacts[0].parent
            for part in allfactsparent:
                if part not in allfacts:
                    self.myKiller.safeRemovePart(part, True)
        articlefacts = soup.find('div', {'class':'article-box-fact column'})
        if articlefacts is not None and allfactsparent is not None:
            contenttag = soup.find('div', {'class':'article-body'})
            if contenttag is not None:
                try:
                    contenttag.insert(0, allfactsparent)
                except Exception:
                    # Best effort, like the other move operations.
                    print('\n!!error in inserting facts!!\n')
        return soup

    def moveTitleAndAuthor(self, soup):
        """Reposition the title and the byline inside ``soup``.

        The first ``<h1>`` is inserted at position 0 of the parent of the
        element with id ``date`` (only when both exist).  The first
        ``div.byline`` is re-inserted at position -1 of its grandparent.
        Failures are reported on stdout and otherwise ignored.
        Returns ``soup``.
        """
        moveitem = soup.h1
        pubdate = soup.find(id="date")
        if moveitem is not None and pubdate is not None:
            try:
                pubdate.parent.insert(0, moveitem)
            except Exception:
                print('\n!!error in moving title!!\n')
        moveitem = soup.find('div', {'class':'byline'})
        if moveitem is not None:
            try:
                moveitem.parent.parent.insert(-1, moveitem)
            except Exception:
                print('\n!!error in moving byline!!\n')
        return soup

    def removeUnwantedTags(self,soup):
        """Insert the fact boxes, drop empty tags, then drop the share bar.

        Returns ``soup``.
        """
        self.insertFacts(soup)
        self.removeEmptyTags(soup)
        self.removeArrayOfTags(soup.findAll(attrs={'class': 'share-tools-bottom'})) # at end to keep author
        return soup

    def removeArrayOfTags(self,souparray):
        """Extract every element of ``souparray``.

        Returns ``souparray`` on success, ``False`` if it is empty or an
        extraction failed (see ``MerryExtract.safeRemovePart``).
        """
        return self.myKiller.safeRemovePart(souparray, True)

    def removeEmptyTags(self,soup,run=0):
        """Repeatedly remove tags that hold no child tags and no real text.

        A tag is considered empty when it is not self-closing, contains no
        child tag, and its ``string`` is ``None`` or consists only of
        whitespace and/or literal ``&nbsp;`` entities.  Removal is repeated
        because extracting an empty tag can leave its parent empty.

        ``run`` is unused; it is kept so existing callers keep working.
        Returns ``soup``.
        """
        def isEmpty(tag):
            if tag.isSelfClosing or tag.find(True) is not None:
                return False
            text = tag.string
            return text is None or self._emptyText.match(text.strip()) is not None

        while True:
            emptytags = soup.findAll(isEmpty)
            if not emptytags:
                break
            # Stop when removal failed: retrying would find the same tags
            # again and never terminate.
            if self.removeArrayOfTags(emptytags) is False:
                break
        return soup