calibre/recipes/metro_news_nl.recipe

# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import print_function
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image

''' Version 1.2, updated cover image to match the changed website.
 added info date on title
 version 1.4 Updated tags, delay and added autoclean 22-09-2011
 version 1.5 Changes due to changes in site
 version 1.6 Added css, removed auto cleanup, added buitenland section, added use_embedded_content, added remove_attributes
    Added some processing on pictures
    Removed links in html
    Removed extra white characters
    changed handling of self closing span
 Version 1.7 11-11-2011 Changed oldest_article back to 1.5
    changed è into &egrave;
    updated remove tags
    removed keep_only tags
 Version 1.8 26-11-2022
   added remove tag: article-slideshow
 Version 1.9 31-1-2012
   removed some left debug settings
      extended timeout from 2 to 10
      changed oldest article from 10 to 1.2
      changed max articles from 15 to 25
 Version 1.9.1 18-04-2012
    removed some debug settings
    updated code to match new metro-layout
 Version 1.9.2 24-04-2012
    updated code to match new metro-layout
 Version 1.9.3 25-04-2012
    Changed a lot of custom code into calibre code as the default code of calibre has become much faster since the first version of this recipe
    Added new feeds
    Updated css
    Changed order of regex to speed up process
 Version 1.9.3 23-05-2012
    Updated Cover image
 Version 1.9.4 19-04-2013
    Added regex filter for mailto
    Updated for new layout of metro-site
 Version 1.9.5 28-05-2013
    Added some extra id's and classes to remove
'''


class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    """Recipe that downloads the Dutch edition of the Metro newspaper.

    The class is almost entirely declarative: it sets attributes that the
    ``BasicNewsRecipe`` base class reads (metadata, download limits, CSS,
    clean-up rules and the RSS feed list) and overrides two HTML hooks that
    hand the parsed page to ``MerryProcess`` for recipe-specific clean-up.
    See the calibre recipe API documentation for the exact semantics of each
    base-class attribute.
    """

    # --- Metadata and download settings read by the base class ---
    title = u'Metro Nieuws NL'
    oldest_article = 1.2
    max_articles_per_feed = 25
    __author__ = u'DrMerry'
    description = u'Metro Nederland v1.9.5 2013-05-28, Download nieuws van de Nederlandse editie van de krant Metro'
    language = u'nl'
    simultaneous_downloads = 5
    masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
    timeout = 10
    center_navbar = True
    timefmt = ' [%A, %d %b %Y]'
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    cover_url = 'http://www.readmetro.com/en/holland/metro-holland/image/large/last/'
    publication_type = 'newspaper'
    encoding = 'utf-8'
    # Attributes listed here are removed from the HTML; 'href' was once part
    # of the list and is left commented out.
    remove_attributes = ['style', 'font', 'width', 'height',
                         'itemtype', 'itemprop', 'itemscope']  # , 'href']
    use_embedded_content = False
    # Minified stylesheet for the generated book (a single line on purpose,
    # hence the noqa marker for the line-length check).
    extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar,.calibre5{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact{font-size:0.7em}.article-box-fact.module-title{margin:8px 0; font-size:0.8em}h2{font-size:1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2,.calibre5,.calibrenavbar{border:0;padding:0}.column1,h1,h2,.calibrenavbar{margin:0}'  # noqa

    # One (pattern, replacement) pair: every match is replaced by a single
    # space. The pattern matches an ``&nbsp;`` entity, one whitespace
    # character, or an <img> tag whose markup points at a metronieuws.nl
    # template JPEG or at internal-roxen-unit.gif.
    preprocess_regexps = [
        (re.compile(r'(&nbsp;|\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)',
                    re.DOTALL | re.IGNORECASE), lambda match: ' '),
    ]

    # Page trimming: the element with id 'subwrapper' and the div with class
    # 'body-area' / 'article-main-area' mark the boundaries handed to the
    # base class.
    remove_tags_before = dict(id='subwrapper')
    remove_tags_after = dict(
        name='div', attrs={'class': ['body-area', 'article-main-area']})
# name='div', attrs={'class':['subwrapper']})]
# 'column-1-3','gallery-text']})]#id='share-and-byline')]

    # Links matching these patterns are filtered (mailto links here).
    filter_regexps = [r'mailto:.*']

    # Tag specifications to strip: embedded/script elements, a long list of
    # div classes and element ids (plain strings and compiled patterns), the
    # comments anchor, the full-screen image and elements whose inline style
    # matches the hidden/mask pattern below.
    remove_tags = [
        dict(name=['iframe', 'script', 'noscript', 'style']),
        dict(name='div', attrs={'class': ['fact-related-box', 'aside clearfix', 'aside clearfix middle-col-line', 'comments', 'share-tools', 'article-right-column', 'column-4-5', 'column-1-5', 'ad-msg', 'col-179 ', 'col-373 ', 'clear', 'ad', 'navigation', re.compile('share-tools(-top)?'), 'tools', 'metroCommentFormWrap', 'article-tools-below-title', 'related-links', 'padding-top-15', re.compile('^promo.*?$'), 'teaser-component', re.compile('fb(-comments|_iframe_widget)'), 'promos', 'header-links', 'promo-2']}),  # noqa
        dict(id=['super-carousel', 'article-2', 'googleads', 'column-1-5-bottom', 'column-4-5', re.compile('^ad(\\d+|adcomp.*?)?$'), 'adadcomp-4', 'margin-5', 'sidebar', re.compile('^article-\\d'), 'comments', 'gallery-1', 'sharez_container', 'ts-container', 'topshares', 'ts-title']),  # noqa
        dict(name='a', attrs={'name': 'comments'}),
        dict(name='img', attrs={'class': 'top-line',
                                'title': 'volledig scherm'}),
        dict(attrs={'style': re.compile('^(.*(display\\s?:\\s?none|img-mask|white)\\s?;?.*)$'), 'title': 'volledig scherm'})]

    '''removed by before/after:
        id:
        column-1-5-top,'hidden_div','footer',
        class:
        'header',re.compile('^footer-[a-zA-Z0-9]+$),'header-links',
        '''

    def preprocess_html(self, soup):
        """Rearrange title/byline and strip unwanted tags from *soup*.

        Delegates to a fresh ``MerryProcess`` instance and returns the same
        soup object that was passed in.
        """
        myProcess = MerryProcess()
        myProcess.moveTitleAndAuthor(soup)
        myProcess.removeUnwantedTags(soup)
        return soup

    def postprocess_html(self, soup, first):
        """Run the layout/image optimisation step on *soup* and return it.

        The ``first`` argument is accepted for the hook signature but is not
        used here.
        """
        myProcess = MerryProcess()
        myProcess.optimizeLayout(soup)
        return soup

    # (section title, RSS url) pairs, one per Metro section.
    feeds = [
        (u'Binnenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-3'),
        (u'Economie', u'http://www.metronieuws.nl/rss.xml?c=1278070988-0'),
        (u'Den Haag', u'http://www.metronieuws.nl/rss.xml?c=1289013337-3'),
        (u'Rotterdam', u'http://www.metronieuws.nl/rss.xml?c=1289013337-2'),
        (u'Amsterdam', u'http://www.metronieuws.nl/rss.xml?c=1289013337-1'),
        (u'Buitenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-4'),
        (u'Columns', u'http://www.metronieuws.nl/rss.xml?c=1277377288-17'),
        (u'Entertainment', u'http://www.metronieuws.nl/rss.xml?c=1277377288-2'),
        (u'Strips', u'http://www.metronieuws.nl/rss.xml?c=1325037714-0'),
        (u'Tech', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
        (u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
        (u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
        (u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
        (u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
        (u'Wetenschap', u'http://www.metronieuws.nl/rss.xml?c=1303088437-0'),
        (u'Planeet', u'http://www.metronieuws.nl/rss.xml?c=1277377288-14'),
        (u'Gezondheid', u'http://www.metronieuws.nl/rss.xml?c=1277377288-15'),
        (u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
    ]


class MerryPreProcess(object):
    """Image clean-up helper applied to a parsed article."""

    def optimizePicture(self, soup):
        """Trim every ``<img>`` that carries a ``src`` and save it in place.

        For each image the value of ``src`` is opened with ``Image``, trimmed
        with ``trim(0)`` and written back to the same location.
        (Presumably ``src`` is a local file path at this stage -- verify
        against the caller.)

        Processing is best effort: an ordinary failure on one image is
        reported on stdout and the loop moves on to the next image.
        ``KeyboardInterrupt`` and ``SystemExit`` are deliberately NOT
        swallowed, so the user can still abort a download.

        :param soup: parsed document offering ``findAll``.
        :return: the same ``soup`` object.
        """
        for tag in soup.findAll('img', src=True):
            try:
                iurl = tag['src']
                img = Image()
                img.open(iurl)
                img.trim(0)
                img.save(iurl)
            except Exception:
                # One broken image must not abort the whole article.
                print('\n!!image optimize failed!!\n')
        return soup


class MerryExtract(object):
    """Helper that detaches nodes from a parsed document without raising."""

    def safeRemovePart(self, killingSoup, soupIsArray):
        """Call ``extract()`` on a node, or on every item of a collection.

        :param killingSoup: a single node, or an iterable of nodes when
            ``soupIsArray`` is ``True``. Falsy values (``None``, an empty
            list) are rejected.
        :param soupIsArray: ``True`` to treat ``killingSoup`` as an iterable
            and extract each item; anything else extracts ``killingSoup``
            itself.
        :return: ``killingSoup`` on success, ``False`` when the input was
            falsy or an ordinary exception occurred during extraction.
        """
        if not killingSoup:
            return False
        try:
            if soupIsArray is True:
                # Iterate over a snapshot: extract() may remove the item from
                # the very container being walked (e.g. when a tag's children
                # are passed in), which would otherwise skip every other item.
                for killer in list(killingSoup):
                    killer.extract()
            else:
                killingSoup.extract()
        except Exception:
            # Best effort; KeyboardInterrupt/SystemExit still propagate.
            return False
        return killingSoup


class MerryProcess(object):
    """Recipe-specific clean-up steps applied to a parsed article.

    All methods mutate the ``soup`` they receive and return that same object.
    """

    # Shared helper instances (class attributes, created once at import).
    myKiller = MerryExtract()
    myPrepare = MerryPreProcess()

    def optimizeLayout(self, soup):
        """Run the image optimisation helper on ``soup`` and return it."""
        self.myPrepare.optimizePicture(soup)
        return soup

    def insertFacts(self, soup):
        """Clean the fact-box container and move it into the article body.

        Looks for ``div`` elements whose class starts with
        ``article-box-fact``. The children of every sibling of the first fact
        box that is not itself a fact box are extracted. If a
        ``div.article-box-fact.column`` and a ``div.article-body`` both
        exist, the fact-box parent is inserted at the start of the article
        body.

        :return: ``soup``; unchanged when no fact boxes are present.
        """
        thefactpart = re.compile('^article-box-fact.*$')
        allfacts = soup.findAll('div', {'class': thefactpart})
        if not allfacts:
            # Nothing to clean and no container that could be moved.
            return soup

        # findAll returns matches in document order, so the first entry is
        # what find() with the same arguments would return.
        allfactsparent = allfacts[0].parent
        for part in allfactsparent:
            if part not in allfacts:
                self.myKiller.safeRemovePart(part, True)

        articlefacts = soup.find('div', {'class': 'article-box-fact column'})
        if articlefacts:
            try:
                contenttag = soup.find('div', {'class': 'article-body'})
                if contenttag:
                    contenttag.insert(0, allfactsparent)
            except Exception:
                # Best effort: keep the article even if the move fails.
                print('\n!!error in inserting facts!!\n')
        return soup

    def moveTitleAndAuthor(self, soup):
        """Move the first ``<h1>`` next to the date and relocate the byline.

        The first ``h1`` is inserted at position 0 of the parent of the
        element with id ``date`` (only when both exist). The first
        ``div.byline`` is re-inserted into its grandparent.

        :return: ``soup``.
        """
        moveitem = soup.h1
        pubdate = soup.find(id="date")
        if moveitem and pubdate:
            try:
                pubdate.parent.insert(0, moveitem)
            except Exception:
                print('\n!!error in moving title!!\n')

        moveitem = soup.find('div', {'class': 'byline'})
        if moveitem:
            try:
                # NOTE(review): position -1 presumably aims at the end of the
                # grandparent; the exact placement depends on how the parser's
                # insert() treats negative positions -- confirm.
                moveitem.parent.parent.insert(-1, moveitem)
            except Exception:
                print('\n!!error in moving byline!!\n')
        return soup

    def removeUnwantedTags(self, soup):
        """Insert fact boxes, drop empty tags, then drop the bottom share bar.

        :return: ``soup``.
        """
        self.insertFacts(soup)
        self.removeEmptyTags(soup)
        # at end to keep author
        self.removeArrayOfTags(soup.findAll(
            attrs={'class': 'share-tools-bottom'}))
        return soup

    def removeArrayOfTags(self, souparray):
        """Extract every node in ``souparray``.

        :return: ``souparray`` on success, ``False`` if it was empty or the
            extraction failed (see ``MerryExtract.safeRemovePart``).
        """
        return self.myKiller.safeRemovePart(souparray, True)

    def removeEmptyTags(self, soup, run=0):
        """Repeatedly remove tags that hold no child tags and no real text.

        A tag qualifies when it is not self-closing, contains no child tag
        and its ``string`` is either ``None`` or consists solely of
        whitespace and/or literal ``&nbsp;`` entities. Removing such tags can
        leave their parents empty, so the search is repeated until nothing
        qualifies any more.

        NOTE(review): ``tag.string`` being ``None`` is treated as "empty", as
        in the original code; whether that can also happen for tags that do
        carry text (e.g. text mixed with a comment node) depends on the
        parser -- confirm.

        :param soup: parsed document to clean in place.
        :param run: unused; kept so existing callers keep working.
        :return: ``soup``.
        """
        # Whole string made of whitespace and/or "&nbsp;" entities only.
        emptymatches = re.compile(r'^(?:&nbsp;|\s)*$')

        def isEmpty(tag):
            if tag.find(True) is not None or tag.isSelfClosing:
                return False
            text = tag.string
            return text is None or emptymatches.match(text) is not None

        while True:
            emptytags = soup.findAll(isEmpty)
            if not emptytags:
                break
            # Stop if removal failed; retrying would find the same tags
            # again and never terminate.
            if self.removeArrayOfTags(emptytags) is False:
                break
        return soup