Fix #889294 (updated Metro NL)

This commit is contained in:
Kovid Goyal 2011-11-12 08:30:19 +05:30
parent 86e5c79180
commit e0a86fcc38

View File

@@ -1,3 +1,4 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
import re import re
from calibre.utils.magick import Image from calibre.utils.magick import Image
@@ -8,21 +9,27 @@ from calibre.utils.magick import Image
version 1.4 Updated tags, delay and added autoclean 22-09-2011 version 1.4 Updated tags, delay and added autoclean 22-09-2011
version 1.5 Changes due to changes in site version 1.5 Changes due to changes in site
version 1.6 Added css, removed auto cleanup, added buitenland section, added use_embedded_content, added remove_attributes version 1.6 Added css, removed auto cleanup, added buitenland section, added use_embedded_content, added remove_attributes
Added som processing on pictures Added some processing on pictures
Removed links in html Removed links in html
Removed extra white characters Removed extra white characters
changed handling of self closing span changed handling of self closing span
''' Version 1.7 11-11-2011 Changed oldest_article back to 1.5
changed è into &egrave;
updated remove tags
removed keep_only tags
'''
class AdvancedUserRecipe1306097511(BasicNewsRecipe): class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro Nieuws NL' title = u'Metro Nieuws NL'
oldest_article = 2 oldest_article = 1.5
max_articles_per_feed = 100 max_articles_per_feed = 100
__author__ = u'DrMerry' __author__ = u'DrMerry'
description = u'Metro Nederland' description = u'Metro Nederland'
language = u'nl' language = u'nl'
simultaneous_downloads = 5 simultaneous_downloads = 5
timeout = 2
#delay = 1 #delay = 1
center_navbar = True
#auto_cleanup = True #auto_cleanup = True
#auto_cleanup_keep = '//div[@class="article-image-caption-2column"]/*|//div[@id="date"]/*|//div[@class="article-image-caption-3column"]/*' #auto_cleanup_keep = '//div[@class="article-image-caption-2column"]/*|//div[@id="date"]/*|//div[@class="article-image-caption-3column"]/*'
timefmt = ' [%A, %d %b %Y]' timefmt = ' [%A, %d %b %Y]'
@@ -31,31 +38,32 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
remove_empty_feeds = True remove_empty_feeds = True
cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg' cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
publication_type = 'newspaper' publication_type = 'newspaper'
remove_tags_before = dict(name='div', attrs={'id':'date'}) remove_tags_before = dict(id='date')
remove_tags_after = dict(name='div', attrs={'class':'article-body'}) remove_tags_after = dict(name='div', attrs={'class':'article-body'})
encoding = 'utf-8' encoding = 'utf-8'
remove_attributes = ['style', 'font', 'width', 'height'] remove_attributes = ['style', 'font', 'width', 'height']
use_embedded_content = False use_embedded_content = False
conversion_options = {
'authors' : 'Metro Nederland',
'author_sort' : 'Metro Nederland',
'publisher' : 'DrMerry/Metro Nederland'
}
extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\ extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\
#date {clear: both;margin-left: 19px;font-size: 11px;font-weight: 300;color: #616262;height: 15px;}\ #date {clear: both;margin-left: 19px;font-size: 11px;font-weight: 300;color: #616262;height: 15px;}\
.article-box-fact.module-title {clear:both;border-top:1px solid black;border-bottom:4px solid black;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\ .article-box-fact.module-title {clear:both;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\
h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;line-height: 1.15;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\ h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\
.article-body p{padding-bottom:10px;}div.column-1-3{float: left;display: inline;width: 567px;margin-left: 19px;border-right: 1px solid #CACACA;padding-right: 9px;}\ .article-body p{padding-bottom:10px;}div.column-1-3{margin-left: 19px;padding-right: 9px;}\
div.column-1-2 {float: left;display: inline;width: 373px;padding-right: 7px;border-right: 1px solid #CACACA;}\ div.column-1-2 {display: inline;padding-right: 7px;}\
p.article-image-caption {font-size: 12px;font-weight: 300;line-height: 1.4;color: #616262;margin-top: 5px;} \ p.article-image-caption {font-size: 12px;font-weight: 300;color: #616262;margin-top: 5px;} \
p.article-image-caption .credits {font-style: italic;font-size: 10px;}\ p.article-image-caption .credits {font-style: italic;font-size: 10px;}\
div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\ div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\
div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\ div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\
img {border:0px;} .img-mask {position:absolute;top:0px;left:0px;}' img {border:0px;} .img-mask {position:absolute;top:0px;left:0px;}'
keep_only_tags = [dict(name='div', attrs={'class':[ 'article-image-caption-2column', 'article-image-caption-3column', 'article-body', 'article-box-fact']}), remove_tags = [dict(name='div', attrs={'class':[ 'metroCommentFormWrap', 'related-links'
dict(name='div', attrs={'id':['date']}),
dict(name='h1', attrs={'class':['title']}),
dict(name='h2', attrs={'class':['subtitle']})]
remove_tags = [dict(name='div', attrs={'class':[ 'metroCommentFormWrap',
'commentForm', 'metroCommentInnerWrap', 'article-slideshow-counter-container', 'article-slideshow-control', 'ad', 'header-links', 'commentForm', 'metroCommentInnerWrap', 'article-slideshow-counter-container', 'article-slideshow-control', 'ad', 'header-links',
'art-rgt','pluck-app pluck-comm', 'share-and-byline', 'article-tools-below-title', 'col-179 ', 'related-links', 'clear padding-top-15', 'share-tools', 'article-page-auto-pushes', 'footer-edit']}), 'art-rgt','pluck-app pluck-comm', 'share-and-byline', 'article-tools-below-title', 'col-179 ', 'related-links', 'clear padding-top-15', 'share-tools',
'article1','article-page-auto-pushes', 'footer-edit','clear']}),
dict(name='div', attrs={'id':['article-2', 'article-4', 'article-1', 'navigation', 'footer', 'header', 'comments', 'sidebar', 'share-and-byline']}), dict(name='div', attrs={'id':['article-2', 'article-4', 'article-1', 'navigation', 'footer', 'header', 'comments', 'sidebar', 'share-and-byline']}),
dict(name='iframe')] dict(name='iframe')]
@@ -70,26 +78,8 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
iurl = tag['src'] iurl = tag['src']
img = Image() img = Image()
img.open(iurl) img.open(iurl)
#width, height = img.size
#print '***img is: ', iurl, '\n****width is: ', width, 'height is: ', height
img.trim(0) img.trim(0)
img.save(iurl) img.save(iurl)
'''
#width, height = img.size
#print '***TRIMMED img width is: ', width, 'height is: ', height
left=0
top=0
border_color='#ffffff'
width, height = img.size
#print '***retrieved img width is: ', width, 'height is: ', height
height_correction = 1.17
canvas = create_canvas(width, height*height_correction,border_color)
canvas.compose(img, left, top)
#img = canvas
canvas.save(iurl)
#width, height = canvas.size
#print '***NEW img width is: ', width, 'height is: ', height
'''
return soup return soup
feeds = [ feeds = [