calibre/recipes/metro_news_nl.recipe
Kovid Goyal ba59ac679d
Fix incorrect soup usage in various recipes
Also make SoupStrainer available in calibre.ebooks.BeautifulSoup
2019-03-25 10:17:27 +05:30

233 lines
11 KiB
Plaintext

# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import print_function
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image
''' Version 1.2, updated cover image to match the changed website.
added info date on title
version 1.4 Updated tags, delay and added autoclean 22-09-2011
version 1.5 Changes due to changes in site
version 1.6 Added css, removed auto cleanup, added buitenland section, added use_embedded_content, added remove_attributes
Added some processing on pictures
Removed links in html
Removed extre white characters
changed handling of self closing span
Version 1.7 11-11-2011 Changed oldest_article back to 1.5
changed è into è
updated remove tags
removed keep_only tags
Version 1.8 26-11-2022
added remove tag: article-slideshow
Version 1.9 31-1-2012
removed some left debug settings
extended timeout from 2 to 10
changed oldest article from 10 to 1.2
changed max articles from 15 to 25
Version 1.9.1 18-04-2012
removed some debug settings
updated code to match new metro-layout
Version 1.9.2 24-04-2012
updated code to match new metro-layout
Version 1.9.3 25-04-2012
Changed a lot of custom code into calibre code as the default code of calibre has become much faster since the first version fo this recipe
Added new feeds
Updated css
Changed order of regex to speedup proces
Version 1.9.3 23-05-2012
Updated Cover image
Version 1.9.4 19-04-2013
Added regex filter for mailto
Updated for new layout of metro-site
Version 1.9.5 28-05-2013
Added some extra id's and classes to remove
'''
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro Nieuws NL'
oldest_article = 1.2
max_articles_per_feed = 25
__author__ = u'DrMerry'
description = u'Metro Nederland v1.9.5 2013-05-28, Download nieuws van de Nederlandse editie van de krant Metro'
language = u'nl'
simultaneous_downloads = 5
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
timeout = 10
center_navbar = True
timefmt = ' [%A, %d %b %Y]'
no_stylesheets = True
remove_javascript = True
remove_empty_feeds = True
cover_url = 'http://www.readmetro.com/en/holland/metro-holland/image/large/last/'
publication_type = 'newspaper'
encoding = 'utf-8'
remove_attributes = ['style', 'font', 'width', 'height',
'itemtype', 'itemprop', 'itemscope'] # , 'href']
use_embedded_content = False
extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar,.calibre5{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact{font-size:0.7em}.article-box-fact.module-title{margin:8px 0; font-size:0.8em}h2{font-size:1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2,.calibre5,.calibrenavbar{border:0;padding:0}.column1,h1,h2,.calibrenavbar{margin:0}' # noqa
preprocess_regexps = [
(re.compile(r'(&nbsp;|\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)',
re.DOTALL | re.IGNORECASE), lambda match: ' '),
]
remove_tags_before = dict(id='subwrapper')
remove_tags_after = dict(
name='div', attrs={'class': ['body-area', 'article-main-area']})
# name='div', attrs={'class':['subwrapper']})]
# 'column-1-3','gallery-text']})]#id='share-and-byline')]
filter_regexps = [r'mailto:.*']
remove_tags = [
dict(name=['iframe', 'script', 'noscript', 'style']),
dict(name='div', attrs={'class': ['fact-related-box', 'aside clearfix', 'aside clearfix middle-col-line', 'comments', 'share-tools', 'article-right-column', 'column-4-5', 'column-1-5', 'ad-msg', 'col-179 ', 'col-373 ', 'clear', 'ad', 'navigation', re.compile('share-tools(-top)?'), 'tools', 'metroCommentFormWrap', 'article-tools-below-title', 'related-links', 'padding-top-15', re.compile('^promo.*?$'), 'teaser-component', re.compile('fb(-comments|_iframe_widget)'), 'promos', 'header-links', 'promo-2']}), # noqa
dict(id=['super-carousel', 'article-2', 'googleads', 'column-1-5-bottom', 'column-4-5', re.compile('^ad(\\d+|adcomp.*?)?$'), 'adadcomp-4', 'margin-5', 'sidebar', re.compile('^article-\\d'), 'comments', 'gallery-1', 'sharez_container', 'ts-container', 'topshares', 'ts-title']), # noqa
dict(name='a', attrs={'name': 'comments'}),
dict(name='img', attrs={'class': 'top-line',
'title': 'volledig scherm'}),
dict(attrs={'style': re.compile('^(.*(display\\s?:\\s?none|img-mask|white)\\s?;?.*)$'), 'title': 'volledig scherm'})]
'''removed by before/after:
id:
column-1-5-top,'hidden_div','footer',
class:
'header',re.compile('^footer-[a-zA-Z0-9]+$),'header-links',
'''
def preprocess_html(self, soup):
myProcess = MerryProcess()
myProcess.moveTitleAndAuthor(soup)
myProcess.removeUnwantedTags(soup)
return soup
def postprocess_html(self, soup, first):
myProcess = MerryProcess()
myProcess.optimizeLayout(soup)
return soup
feeds = [
(u'Binnenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-3'),
(u'Economie', u'http://www.metronieuws.nl/rss.xml?c=1278070988-0'),
(u'Den Haag', u'http://www.metronieuws.nl/rss.xml?c=1289013337-3'),
(u'Rotterdam', u'http://www.metronieuws.nl/rss.xml?c=1289013337-2'),
(u'Amsterdam', u'http://www.metronieuws.nl/rss.xml?c=1289013337-1'),
(u'Buitenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-4'),
(u'Columns', u'http://www.metronieuws.nl/rss.xml?c=1277377288-17'),
(u'Entertainment', u'http://www.metronieuws.nl/rss.xml?c=1277377288-2'),
(u'Strips', u'http://www.metronieuws.nl/rss.xml?c=1325037714-0'),
(u'Tech', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
(u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
(u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
(u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
(u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
(u'Wetenschap', u'http://www.metronieuws.nl/rss.xml?c=1303088437-0'),
(u'Planeet', u'http://www.metronieuws.nl/rss.xml?c=1277377288-14'),
(u'Gezondheid', u'http://www.metronieuws.nl/rss.xml?c=1277377288-15'),
(u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
]
class MerryPreProcess():
def optimizePicture(self, soup):
for tag in soup.findAll('img', src=True):
try:
iurl = tag['src']
img = Image()
img.open(iurl)
img.trim(0)
img.save(iurl)
except:
print('\n!!image optimize failed!!\n')
continue
return soup
class MerryExtract():
def safeRemovePart(self, killingSoup, soupIsArray):
if killingSoup and killingSoup is not None:
try:
if soupIsArray is True:
for killer in killingSoup:
killer.extract()
else:
killingSoup.extract()
except:
return False
else:
return False
return killingSoup
class MerryProcess(object):
myKiller = MerryExtract()
myPrepare = MerryPreProcess()
def optimizeLayout(self, soup):
self.myPrepare.optimizePicture(soup)
return soup
def insertFacts(self, soup):
thefactpart = re.compile('^article-box-fact.*$')
allfacts = soup.findAll('div', {'class': thefactpart})
if allfacts and allfacts is not None:
allfactsparent = soup.find('div', {'class': thefactpart}).parent
for part in allfactsparent:
if part not in allfacts:
self.myKiller.safeRemovePart(part, True)
articlefacts = soup.find('div', {'class': 'article-box-fact column'})
if (articlefacts and articlefacts is not None):
try:
contenttag = soup.find('div', {'class': 'article-body'})
foundrighttag = False
if contenttag and contenttag is not None:
foundrighttag = True
if foundrighttag is True:
contenttag.insert(0, allfactsparent)
except:
pass
return soup
def moveTitleAndAuthor(self, soup):
moveitem = soup.h1
pubdate = soup.find(id="date")
if moveitem and moveitem is not None and pubdate and pubdate is not None:
try:
pubdate.parent.insert(0, moveitem)
except:
print('\n!!error in moving title!!\n')
pass
moveitem = None
moveitem = soup.find('div', {'class': 'byline'})
if moveitem and moveitem is not None:
try:
moveitem.parent.parent.insert(-1, moveitem)
except:
print('\n!!error in moving byline!!\n')
pass
return soup
def removeUnwantedTags(self, soup):
self.insertFacts(soup)
self.removeEmptyTags(soup)
# at end to keep author
self.removeArrayOfTags(soup.findAll(
attrs={'class': 'share-tools-bottom'}))
return soup
def removeArrayOfTags(self, souparray):
return self.myKiller.safeRemovePart(souparray, True)
def removeEmptyTags(self, soup, run=0):
emptymatches = re.compile('^[&nbsp;\\s\n\r\t ]*$')
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (
tag.string is None or tag.string.strip() == "" or tag.string.strip() == emptymatches) and not tag.isSelfClosing)
if emptytags and not (emptytags is None or emptytags == []):
self.removeArrayOfTags(emptytags)
# recursive in case removing empty tag creates new empty tag
self.removeEmptyTags(soup, run=run)
return soup