# calibre/recipes/metro_news_nl.recipe

from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image


''' Version 1.2, updated cover image to match the changed website.
 added info date on title
 version 1.4 Updated tags, delay and added autoclean 22-09-2011
 version 1.5 Changes due to changes in site
 version 1.6 Added css, removed auto cleanup, added buitenland section, added use_embedded_content, added remove_attributes
    Added some processing on pictures
    Removed links in html
    Removed extra whitespace characters
    Changed handling of self-closing span
 '''

class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    """News recipe for 'Metro Nieuws NL' built from the metronieuws.nl RSS feeds.

    The class is almost entirely declarative: the attributes below configure
    the BasicNewsRecipe machinery (which feeds to read, which parts of each
    article page to keep or drop, which regular expressions to run over the
    raw HTML and which CSS to apply).  The only custom code is
    ``postprocess_html``, which trims every image referenced by an article.
    """

    title = u'Metro Nieuws NL'
    __author__ = u'DrMerry'
    description = u'Metro Nederland'
    language = u'nl'
    publication_type = 'newspaper'
    timefmt = ' [%A, %d %b %Y]'
    cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'

    # Download behaviour.
    oldest_article = 2
    max_articles_per_feed = 100
    simultaneous_downloads = 5
    encoding = 'utf-8'
    use_embedded_content = False
    remove_empty_feeds = True
    # Disabled settings, kept for reference (auto cleanup was switched off in
    # version 1.6 in favour of the explicit keep/remove rules below).
    #delay          = 1
    #auto_cleanup = True
    #auto_cleanup_keep = '//div[@class="article-image-caption-2column"]/*|//div[@id="date"]/*|//div[@class="article-image-caption-3column"]/*'

    # Clean-up of the downloaded article pages.
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['style', 'font', 'width', 'height']
    remove_tags_before = dict(name='div', attrs={'id':'date'})
    remove_tags_after = dict(name='div', attrs={'class':'article-body'})

    # NOTE: the continuation lines are part of the string literal, so their
    # leading whitespace ends up in the CSS text; keep them as they are.
    extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\
        #date {clear: both;margin-left: 19px;font-size: 11px;font-weight: 300;color: #616262;height: 15px;}\
        .article-box-fact.module-title {clear:both;border-top:1px solid black;border-bottom:4px solid black;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\
        h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;line-height: 1.15;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\
        .article-body p{padding-bottom:10px;}div.column-1-3{float: left;display: inline;width: 567px;margin-left: 19px;border-right: 1px solid #CACACA;padding-right: 9px;}\
        div.column-1-2 {float: left;display: inline;width: 373px;padding-right: 7px;border-right: 1px solid #CACACA;}\
        p.article-image-caption {font-size: 12px;font-weight: 300;line-height: 1.4;color: #616262;margin-top: 5px;} \
        p.article-image-caption .credits {font-style: italic;font-size: 10px;}\
        div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\
        div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\
        img {border:0px;} .img-mask {position:absolute;top:0px;left:0px;}'

    # Only these containers (image captions, body, fact box, date, title and
    # subtitle) are kept from an article page.
    keep_only_tags = [dict(name='div', attrs={'class':[ 'article-image-caption-2column', 'article-image-caption-3column', 'article-body', 'article-box-fact']}),
        dict(name='div', attrs={'id':['date']}),
        dict(name='h1', attrs={'class':['title']}),
        dict(name='h2', attrs={'class':['subtitle']})]

    # Comment forms, slideshow controls, ads, sharing tools, navigation and
    # similar page furniture are dropped, as are all iframes.
    # NOTE(review): 'col-179 ' carries a trailing space; presumably it mirrors
    # the class attribute exactly as the site emits it - confirm before
    # "fixing" it.
    remove_tags = [dict(name='div', attrs={'class':[ 'metroCommentFormWrap',
        'commentForm', 'metroCommentInnerWrap', 'article-slideshow-counter-container', 'article-slideshow-control', 'ad', 'header-links',
        'art-rgt','pluck-app pluck-comm', 'share-and-byline', 'article-tools-below-title', 'col-179 ', 'related-links', 'clear padding-top-15', 'share-tools', 'article-page-auto-pushes', 'footer-edit']}),
        dict(name='div', attrs={'id':['article-2', 'article-4', 'article-1', 'navigation', 'footer', 'header', 'comments', 'sidebar', 'share-and-byline']}),
        dict(name='iframe')]

    preprocess_regexps = [
        # Drop empty paragraphs, "Tweet" links (including their text), the
        # opening/closing tags of every other link (the link text stays) and
        # HTML comments.  The \b after "<a" restricts the match to real anchor
        # tags; without it "<a[^>]*>" would also swallow the opening tags of
        # <abbr>, <address>, <article>, <aside>, ... and leave their closing
        # tags orphaned.
        (re.compile(r'(<p>(&nbsp;|\s)*</p>|<a\b[^>]*>Tweet</a>|<a\b[^>]*>|</a>|<!--.*?-->)', re.DOTALL|re.IGNORECASE),
            lambda match: ''),
        # Collapse runs of &nbsp; / repeated whitespace into a single space.
        (re.compile(r'(&nbsp;|\s\s)+\s*', re.DOTALL|re.IGNORECASE),
            lambda match: ' '),
        # A self-closing span directly after a word ("word<span ... />") is
        # rewritten so that the span wraps that word:
        # "<span ...>word</span>".
        (re.compile(r'([\s>])([^\s>]+)(<span[^>]+) />', re.DOTALL|re.IGNORECASE),
            lambda match: match.group(1) + match.group(3) + '>' + match.group(2) + '</span>'),
        ]

    feeds = [
        (u'Binnenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-3'),
        (u'Economie', u'http://www.metronieuws.nl/rss.xml?c=1278070988-0'),
        (u'Den Haag', u'http://www.metronieuws.nl/rss.xml?c=1289013337-3'),
        (u'Rotterdam', u'http://www.metronieuws.nl/rss.xml?c=1289013337-2'),
        (u'Amsterdam', u'http://www.metronieuws.nl/rss.xml?c=1289013337-1'),
        (u'Buitenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-4'),
        (u'Columns', u'http://www.metronieuws.nl/rss.xml?c=1277377288-17'),
        (u'Entertainment', u'http://www.metronieuws.nl/rss.xml?c=1277377288-2'),
        (u'Dot', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
        (u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
        (u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
        (u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
        (u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
        (u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
        ]

    def postprocess_html(self, soup, first):
        """Trim every image referenced by an <img> tag that has a src attribute.

        Each image is opened from the tag's ``src`` value, trimmed with
        ``trim(0)`` and saved back to the same location, so the files are
        modified in place; the markup itself is not changed.

        soup  -- parsed article document to scan for <img> tags.
        first -- not used by this implementation.

        Returns the same ``soup`` object.
        """
        # tag.get() is used instead of the legacy has_key(): it is available
        # on both the old and the current BeautifulSoup Tag classes.
        def has_image_source(tag):
            return tag.name.lower() == 'img' and tag.get('src') is not None

        for tag in soup.findAll(has_image_source):
            # NOTE(review): src is presumably the path of the already
            # downloaded local copy at this stage - confirm.
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            img.trim(0)
            img.save(iurl)
        return soup