mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-09-29 15:31:08 -04:00
233 lines
11 KiB
Plaintext
233 lines
11 KiB
Plaintext
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
|
from __future__ import print_function
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
import re
|
|
from calibre.utils.magick import Image
|
|
|
|
''' Version 1.2, updated cover image to match the changed website.
|
|
added info date on title
|
|
version 1.4 Updated tags, delay and added autoclean 22-09-2011
|
|
version 1.5 Changes due to changes in site
|
|
version 1.6 Added css, removed auto cleanup, added buitenland section, added use_embedded_content, added remove_attributes
|
|
Added some processing on pictures
|
|
Removed links in html
|
|
Removed extra whitespace characters
|
|
changed handling of self closing span
|
|
Version 1.7 11-11-2011 Changed oldest_article back to 1.5
|
|
changed &egrave; into è
|
|
updated remove tags
|
|
removed keep_only tags
|
|
Version 1.8 26-11-2011
|
|
added remove tag: article-slideshow
|
|
Version 1.9 31-1-2012
|
|
removed some left debug settings
|
|
extended timeout from 2 to 10
|
|
changed oldest article from 10 to 1.2
|
|
changed max articles from 15 to 25
|
|
Version 1.9.1 18-04-2012
|
|
removed some debug settings
|
|
updated code to match new metro-layout
|
|
Version 1.9.2 24-04-2012
|
|
updated code to match new metro-layout
|
|
Version 1.9.3 25-04-2012
|
|
Changed a lot of custom code into calibre code, as the default code of calibre has become much faster since the first version of this recipe
|
|
Added new feeds
|
|
Updated css
|
|
Changed order of regexes to speed up processing
|
|
Version 1.9.3 23-05-2012
|
|
Updated Cover image
|
|
Version 1.9.4 19-04-2013
|
|
Added regex filter for mailto
|
|
Updated for new layout of metro-site
|
|
Version 1.9.5 28-05-2013
|
|
Added some extra id's and classes to remove
|
|
'''
|
|
|
|
|
|
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    """Recipe settings and HTML hooks for the Dutch "Metro Nieuws NL" feeds.

    The class is almost entirely declarative: it lists the RSS feeds, the
    tags/attributes to strip and the CSS to inject. The two HTML hooks at the
    bottom delegate the actual DOM work to ``MerryProcess``.
    """

    # ---- publication metadata ----
    title = u'Metro Nieuws NL'
    __author__ = u'DrMerry'
    description = u'Metro Nederland v1.9.5 2013-05-28, Download nieuws van de Nederlandse editie van de krant Metro'
    language = u'nl'
    publication_type = 'newspaper'
    encoding = 'utf-8'
    timefmt = ' [%A, %d %b %Y]'
    masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
    cover_url = 'http://www.readmetro.com/en/holland/metro-holland/image/large/last/'

    # ---- download settings ----
    oldest_article = 1.2
    max_articles_per_feed = 25
    simultaneous_downloads = 5
    timeout = 10
    remove_empty_feeds = True
    use_embedded_content = False

    # ---- generic clean-up switches ----
    center_navbar = True
    no_stylesheets = True
    remove_javascript = True
    # 'href' is not part of this list, so links keep their targets.
    remove_attributes = [
        'style', 'font', 'width', 'height',
        'itemtype', 'itemprop', 'itemscope',
    ]

    # Stylesheet injected into every article; written one rule per line, the
    # pieces concatenate to a single string without separators.
    extra_css = (
        'body{font-size:1em;padding:5px 0}'
        'body,a,h2{background-color:#fff;text-decoration:none;color:#000}'
        '#date,div.byline,p.article-image-caption .credits,.calibrenavbar,.calibre5{font-size:.5em}'
        '.article-box-fact.module-title,#date,div.byline{clear:both}'
        '.article-box-fact{font-size:0.7em}'
        '.article-box-fact.module-title{margin:8px 0; font-size:0.8em}'
        'h2{font-size:1em}'
        'h1.title{font-size:1.4em}'
        'h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}'  # noqa
        'div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}'  # noqa
        'div.column-1-3{margin-left:19px}'
        'div.column-1-2{display:inline}'
        'div.column-1-2,div.column-1-3{margin-right:7px}'
        'p.article-image-caption{font-size:.6em;margin-top:5px}'
        'p.article-image-caption,#date,div.byline{color:#616262}'
        'p.article-image-caption .credits{font-style:italic}'
        'div.article-image-caption{width:246px}'
        'div.article-image-caption-2column{width:373px}'
        'div.column-3{background-color:#eee;float:right;width:50%}'
        'div.column-3 module-title{border:1px solid #aaa}'
        'div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}'
        'div.byline{border-top:2px solid #24763b}'
        'div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}'
        'img,p.small,.column1,h2,.calibre5,.calibrenavbar{border:0;padding:0}'
        '.column1,h1,h2,.calibrenavbar{margin:0}'
    )

    # Every match (a whitespace character or one of the site's template /
    # spacer images) is replaced by a single space.
    preprocess_regexps = [
        (
            re.compile(
                r'( |\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)',
                re.DOTALL | re.IGNORECASE,
            ),
            lambda _match: ' ',
        ),
    ]

    # Window of the page that is kept.
    remove_tags_before = dict(id='subwrapper')
    remove_tags_after = dict(
        name='div',
        attrs={'class': ['body-area', 'article-main-area']},
    )
    # Commented-out variants of the setting above:
    # name='div', attrs={'class':['subwrapper']})]
    # 'column-1-3','gallery-text']})]#id='share-and-byline')]

    filter_regexps = [r'mailto:.*']

    remove_tags = [
        dict(name=['iframe', 'script', 'noscript', 'style']),
        dict(name='div', attrs={'class': [
            'fact-related-box',
            'aside clearfix',
            'aside clearfix middle-col-line',
            'comments',
            'share-tools',
            'article-right-column',
            'column-4-5',
            'column-1-5',
            'ad-msg',
            'col-179 ',
            'col-373 ',
            'clear',
            'ad',
            'navigation',
            re.compile('share-tools(-top)?'),
            'tools',
            'metroCommentFormWrap',
            'article-tools-below-title',
            'related-links',
            'padding-top-15',
            re.compile('^promo.*?$'),
            'teaser-component',
            re.compile('fb(-comments|_iframe_widget)'),
            'promos',
            'header-links',
            'promo-2',
        ]}),
        dict(id=[
            'super-carousel',
            'article-2',
            'googleads',
            'column-1-5-bottom',
            'column-4-5',
            re.compile('^ad(\\d+|adcomp.*?)?$'),
            'adadcomp-4',
            'margin-5',
            'sidebar',
            re.compile('^article-\\d'),
            'comments',
            'gallery-1',
            'sharez_container',
            'ts-container',
            'topshares',
            'ts-title',
        ]),
        dict(name='a', attrs={'name': 'comments'}),
        dict(name='img', attrs={'class': 'top-line', 'title': 'volledig scherm'}),
        dict(attrs={
            'style': re.compile('^(.*(display\\s?:\\s?none|img-mask|white)\\s?;?.*)$'),
            'title': 'volledig scherm',
        }),
    ]

    # Author's note: already removed by the before/after settings above:
    #   id:    column-1-5-top, 'hidden_div', 'footer'
    #   class: 'header', re.compile('^footer-[a-zA-Z0-9]+$), 'header-links'

    def preprocess_html(self, soup):
        """Move title/byline and strip unwanted tags; returns the same soup."""
        processor = MerryProcess()
        processor.moveTitleAndAuthor(soup)
        processor.removeUnwantedTags(soup)
        return soup

    def postprocess_html(self, soup, first):
        """Run the image optimisation pass; returns the same soup.

        ``first`` is accepted but not used here.
        """
        MerryProcess().optimizeLayout(soup)
        return soup

    # (section title, RSS url) pairs, in the order they are declared.
    feeds = [
        (u'Binnenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-3'),
        (u'Economie', u'http://www.metronieuws.nl/rss.xml?c=1278070988-0'),
        (u'Den Haag', u'http://www.metronieuws.nl/rss.xml?c=1289013337-3'),
        (u'Rotterdam', u'http://www.metronieuws.nl/rss.xml?c=1289013337-2'),
        (u'Amsterdam', u'http://www.metronieuws.nl/rss.xml?c=1289013337-1'),
        (u'Buitenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-4'),
        (u'Columns', u'http://www.metronieuws.nl/rss.xml?c=1277377288-17'),
        (u'Entertainment', u'http://www.metronieuws.nl/rss.xml?c=1277377288-2'),
        (u'Strips', u'http://www.metronieuws.nl/rss.xml?c=1325037714-0'),
        (u'Tech', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
        (u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
        (u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
        (u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
        (u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
        (u'Wetenschap', u'http://www.metronieuws.nl/rss.xml?c=1303088437-0'),
        (u'Planeet', u'http://www.metronieuws.nl/rss.xml?c=1277377288-14'),
        (u'Gezondheid', u'http://www.metronieuws.nl/rss.xml?c=1277377288-15'),
        (u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12'),
    ]
|
|
|
|
|
|
class MerryPreProcess(object):
    """Image clean-up helper used by ``MerryProcess.optimizeLayout``."""

    def optimizePicture(self, soup):
        """Trim every ``<img>`` that has a ``src`` and save it back in place.

        Each image is opened from the value of its ``src`` attribute, trimmed
        with ``img.trim(0)`` and written back to that same location. The
        operation is best-effort: a failure on one image is reported on
        stdout and the remaining images are still processed.

        :param soup: parsed document exposing ``findAll``.
        :return: the same ``soup`` object (the markup itself is not changed).
        """
        for tag in soup.findAll('img', src=True):
            try:
                iurl = tag['src']
                img = Image()
                img.open(iurl)
                img.trim(0)
                img.save(iurl)
            except Exception:
                # Only ordinary errors are swallowed; KeyboardInterrupt and
                # SystemExit must still be able to abort the download.
                print('\n!!image optimize failed!!\n')
        return soup
|
|
|
|
|
|
class MerryExtract(object):
    """Helper that detaches nodes from a parsed document without raising."""

    def safeRemovePart(self, killingSoup, soupIsArray):
        """Call ``extract()`` on a node, or on every item of a collection.

        :param killingSoup: a single node, or an iterable of nodes. A falsy
            value (``None``, empty list) means there is nothing to do.
        :param soupIsArray: when truthy, ``killingSoup`` is iterated and each
            item is extracted; otherwise ``killingSoup`` itself is extracted.
        :return: ``killingSoup`` on success, ``False`` when there was nothing
            to remove or when an extraction raised.
        """
        if not killingSoup:
            return False
        try:
            if soupIsArray:
                # Iterate over a snapshot: extract() may remove the item from
                # the very container being walked, and iterating the live
                # container would then skip every other item.
                for killer in list(killingSoup):
                    killer.extract()
            else:
                killingSoup.extract()
        except Exception:
            # Best effort: callers only test the result for False.
            return False
        return killingSoup
|
|
|
|
|
|
class MerryProcess(object):
    """DOM manipulation steps applied to each downloaded article.

    All methods mutate the soup they are given and return that same soup.
    """

    # Shared helpers, created once at class-definition time.
    myKiller = MerryExtract()
    myPrepare = MerryPreProcess()

    def optimizeLayout(self, soup):
        """Run the image optimisation pass over ``soup`` and return it."""
        self.myPrepare.optimizePicture(soup)
        return soup

    def insertFacts(self, soup):
        """Move the container holding the fact boxes to the top of the body.

        Fact boxes are ``<div>`` elements whose class starts with
        ``article-box-fact``. Children of their parent that are not fact
        boxes are handed to ``safeRemovePart`` with ``soupIsArray=True`` (so
        the *contents* of those children are extracted). If a
        ``div.article-box-fact column`` and a ``div.article-body`` both
        exist, the parent is then inserted as the body's first child.
        """
        thefactpart = re.compile('^article-box-fact.*$')
        allfacts = soup.findAll('div', {'class': thefactpart})
        allfactsparent = None
        if allfacts:
            # The first match is what soup.find() would return as well.
            allfactsparent = allfacts[0].parent
            for part in allfactsparent:
                if part not in allfacts:
                    self.myKiller.safeRemovePart(part, True)

        articlefacts = soup.find('div', {'class': 'article-box-fact column'})
        if not articlefacts or allfactsparent is None:
            return soup
        contenttag = soup.find('div', {'class': 'article-body'})
        if not contenttag:
            return soup
        try:
            contenttag.insert(0, allfactsparent)
        except Exception:
            # Best effort, but no longer silent.
            print('\n!!error in inserting facts!!\n')
        return soup

    def moveTitleAndAuthor(self, soup):
        """Reposition the headline and the byline.

        The first ``<h1>`` becomes the first child of the parent of the
        ``id="date"`` element; ``div.byline`` is re-inserted into its
        grandparent. Failures are reported on stdout and otherwise ignored.
        """
        title = soup.h1
        pubdate = soup.find(id="date")
        if title and pubdate:
            try:
                pubdate.parent.insert(0, title)
            except Exception:
                print('\n!!error in moving title!!\n')

        byline = soup.find('div', {'class': 'byline'})
        if byline:
            try:
                # NOTE(review): index -1 presumably means "near the end";
                # whether that is before the last child or after it depends
                # on the soup implementation - confirm.
                byline.parent.parent.insert(-1, byline)
            except Exception:
                print('\n!!error in moving byline!!\n')
        return soup

    def removeUnwantedTags(self, soup):
        """Relocate facts, drop empty tags, then drop the bottom share bar."""
        self.insertFacts(soup)
        self.removeEmptyTags(soup)
        # at end to keep author
        self.removeArrayOfTags(soup.findAll(
            attrs={'class': 'share-tools-bottom'}))
        return soup

    def removeArrayOfTags(self, souparray):
        """Extract every node in ``souparray``.

        :return: ``souparray`` on success, ``False`` if it was empty or an
            extraction failed (see ``MerryExtract.safeRemovePart``).
        """
        return self.myKiller.safeRemovePart(souparray, True)

    def removeEmptyTags(self, soup, run=0):
        """Repeatedly remove tags with no child tags and no visible text.

        A tag qualifies when it contains no other tag, its ``string`` is
        ``None`` or whitespace only, and it is not self-closing. Removing a
        tag can leave its parent empty, so passes repeat until nothing
        qualifies.

        :param soup: parsed document, modified in place.
        :param run: unused; kept so existing callers remain valid.
        :return: the same ``soup``.
        """
        # Whitespace-only test; the non-breaking space is listed explicitly
        # for regex engines where \s does not cover it.
        blank = re.compile(r'^[\s\xa0]*$')

        def is_empty(tag):
            if tag.find(True) is not None:
                return False
            text = tag.string
            if text is not None and blank.match(text) is None:
                return False
            return not tag.isSelfClosing

        while True:
            emptytags = soup.findAll(is_empty)
            if not emptytags:
                break
            if self.removeArrayOfTags(emptytags) is False:
                # Extraction failed: the same tags would be found again on
                # the next pass, so stop instead of looping forever.
                break
        return soup
|