# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
import re

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.utils.magick import Image
from calibre.web.feeds.news import BasicNewsRecipe

'''
Version 1.2, updated cover image to match the changed website.
    added info date on title
    version 1.4 Updated tags, delay and added autoclean 22-09-2011
    version 1.5 Changes due to changes in site
    version 1.6 Added css, removed auto cleanup, added buitenland section,
        added use_embedded_content, added remove_attributes
        Added some processing on pictures
        Removed links in html
        Removed extra white characters
        changed handling of self closing span
Version 1.7 11-11-2011 Changed oldest_article back to 1.5
        changed è into è
        updated remove tags
        removed keep_only tags
Version 1.8 26-11-2022
        added remove tag: article-slideshow
Version 1.9 31-1-2012
        removed some left debug settings
        extended timeout from 2 to 10
        changed oldest article from 10 to 1.2
        changed max articles from 15 to 25
Version 1.9.1 18-04-2012
        removed some debug settings
        updated code to match new metro-layout
Version 1.9.2 24-04-2012
        updated code to match new metro-layout
Version 1.9.3 25-04-2012
        Changed a lot of custom code into calibre code as the default code
        of calibre has become much faster since the first version of this
        recipe
        Added new feeds
        Updated css
        Changed order of regex to speedup process
Version 1.9.3 23-05-2012
        Updated Cover image
Version 1.9.4 19-04-2013
        Added regex filter for mailto
        Updated for new layout of metro-site
Version 1.9.5 28-05-2013
        Added some extra id's and classes to remove
'''


class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    """Calibre recipe for the Dutch edition of the Metro newspaper.

    Articles come from the RSS feeds listed in ``feeds``. Page clean-up is
    configured through the ``remove_*`` attributes and completed by the
    ``MerryProcess`` helper in ``preprocess_html`` / ``postprocess_html``.
    """

    title = u'Metro Nieuws NL'
    oldest_article = 1.2
    max_articles_per_feed = 25
    __author__ = u'DrMerry'
    description = u'Metro Nederland v1.9.5 2013-05-28, Download nieuws van de Nederlandse editie van de krant Metro'
    language = u'nl'
    simultaneous_downloads = 5
    masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
    timeout = 10
    center_navbar = True
    timefmt = ' [%A, %d %b %Y]'
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    cover_url = 'http://www.readmetro.com/en/holland/metro-holland/image/large/last/'
    publication_type = 'newspaper'
    encoding = 'utf-8'
    remove_attributes = ['style', 'font', 'width', 'height',
                         'itemtype', 'itemprop', 'itemscope']  # , 'href']
    use_embedded_content = False
    extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar,.calibre5{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact{font-size:0.7em}.article-box-fact.module-title{margin:8px 0; font-size:0.8em}h2{font-size:1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2,.calibre5,.calibrenavbar{border:0;padding:0}.column1,h1,h2,.calibrenavbar{margin:0}'  # noqa

    # Every match of the pattern is replaced by a single space.
    # NOTE(review): the third alternative starts with a bare ``]+``, which
    # looks like the remains of a truncated ``<img[^>]+`` prefix (and the
    # first alternative may once have been a non-breaking space entity).
    # The pattern is kept exactly as found -- confirm against the recipe's
    # history before changing it.
    preprocess_regexps = [
        (re.compile(r'( |\s|]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)',
                    re.DOTALL | re.IGNORECASE),
         lambda match: ' '),
    ]

    remove_tags_before = dict(id='subwrapper')
    remove_tags_after = dict(
        name='div', attrs={'class': ['body-area', 'article-main-area']})
    # Earlier remove_tags_after candidates, kept for reference:
    # name='div', attrs={'class':['subwrapper']})]
    # 'column-1-3','gallery-text']})]#id='share-and-byline')]
    filter_regexps = [r'mailto:.*']

    # Patterns containing backslash escapes are raw strings so that ``\d``
    # and ``\s`` reach the regex engine without invalid-escape warnings.
    remove_tags = [
        dict(name=['iframe', 'script', 'noscript', 'style']),
        dict(name='div', attrs={'class': [
            'fact-related-box', 'aside clearfix',
            'aside clearfix middle-col-line', 'comments', 'share-tools',
            'article-right-column', 'column-4-5', 'column-1-5', 'ad-msg',
            'col-179 ', 'col-373 ', 'clear', 'ad', 'navigation',
            re.compile('share-tools(-top)?'), 'tools',
            'metroCommentFormWrap', 'article-tools-below-title',
            'related-links', 'padding-top-15', re.compile('^promo.*?$'),
            'teaser-component',
            re.compile('fb(-comments|_iframe_widget)'), 'promos',
            'header-links', 'promo-2']}),
        dict(id=[
            'super-carousel', 'article-2', 'googleads', 'column-1-5-bottom',
            'column-4-5', re.compile(r'^ad(\d+|adcomp.*?)?$'), 'adadcomp-4',
            'margin-5', 'sidebar', re.compile(r'^article-\d'), 'comments',
            'gallery-1', 'sharez_container', 'ts-container', 'topshares',
            'ts-title']),
        dict(name='a', attrs={'name': 'comments'}),
        dict(name='img', attrs={'class': 'top-line',
                                'title': 'volledig scherm'}),
        dict(attrs={
            'style': re.compile(
                r'^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$'),
            'title': 'volledig scherm'}),
    ]
    # Already removed by remove_tags_before/remove_tags_after:
    #   id: column-1-5-top, 'hidden_div', 'footer'
    #   class: 'header', re.compile('^footer-[a-zA-Z0-9]+$), 'header-links'

    def preprocess_html(self, soup):
        """Move the title and byline, then strip unwanted tags.

        Returns the same ``soup`` object, modified in place.
        """
        myProcess = MerryProcess()
        myProcess.moveTitleAndAuthor(soup)
        myProcess.removeUnwantedTags(soup)
        return soup

    def postprocess_html(self, soup, first):
        """Trim the images referenced by ``soup``.

        ``first`` is accepted as part of the hook signature and is not used.
        Returns the same ``soup`` object.
        """
        myProcess = MerryProcess()
        myProcess.optimizeLayout(soup)
        return soup

    feeds = [
        (u'Binnenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-3'),
        (u'Economie', u'http://www.metronieuws.nl/rss.xml?c=1278070988-0'),
        (u'Den Haag', u'http://www.metronieuws.nl/rss.xml?c=1289013337-3'),
        (u'Rotterdam', u'http://www.metronieuws.nl/rss.xml?c=1289013337-2'),
        (u'Amsterdam', u'http://www.metronieuws.nl/rss.xml?c=1289013337-1'),
        (u'Buitenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-4'),
        (u'Columns', u'http://www.metronieuws.nl/rss.xml?c=1277377288-17'),
        (u'Entertainment', u'http://www.metronieuws.nl/rss.xml?c=1277377288-2'),
        (u'Strips', u'http://www.metronieuws.nl/rss.xml?c=1325037714-0'),
        (u'Tech', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
        (u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
        (u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
        (u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
        (u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
        (u'Wetenschap', u'http://www.metronieuws.nl/rss.xml?c=1303088437-0'),
        (u'Planeet', u'http://www.metronieuws.nl/rss.xml?c=1277377288-14'),
        (u'Gezondheid', u'http://www.metronieuws.nl/rss.xml?c=1277377288-15'),
        (u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
    ]


class MerryPreProcess(object):
    """Image clean-up helper used by ``MerryProcess.optimizeLayout``."""

    def optimizePicture(self, soup):
        """Trim every ``<img>`` in ``soup`` that carries a ``src`` attribute.

        Each image is opened from the path in ``src``, trimmed, and saved
        back to that same path. A failure on one image is reported on stdout
        and does not stop the remaining images. Returns ``soup``.
        """
        def has_source(tag):
            # ``get`` is used instead of ``has_key``, which newer
            # BeautifulSoup releases no longer provide.
            return tag.name.lower() == 'img' and tag.get('src') is not None

        for tag in soup.findAll(has_source):
            try:
                iurl = tag['src']
                img = Image()
                img.open(iurl)
                img.trim(0)
                img.save(iurl)
            except Exception:
                # Best effort: a broken image must not abort the article.
                print('\n!!image optimize failed!!\n')
        return soup


class MerryExtract(object):
    """Wrapper around ``extract()`` that never raises."""

    def safeRemovePart(self, killingSoup, soupIsArray):
        """Detach ``killingSoup`` from its tree.

        :param killingSoup: a single node, or an iterable of nodes.
        :param soupIsArray: when ``True`` every item yielded by iterating
            ``killingSoup`` is extracted; otherwise ``killingSoup`` itself
            is extracted.
        :return: ``killingSoup`` on success, ``False`` when it is falsy or
            when any extraction raised.
        """
        if not killingSoup:
            return False
        try:
            if soupIsArray is True:
                # NOTE(review): when a tag (rather than a result list) is
                # passed here, this iterates the tag while detaching what it
                # yields. Kept as in the original; confirm whether iterating
                # a snapshot is wanted.
                for killer in killingSoup:
                    killer.extract()
            else:
                killingSoup.extract()
        except Exception:
            return False
        return killingSoup


class MerryProcess(BeautifulSoup):
    """Page restructuring steps used by the recipe's HTML hooks.

    None of the methods below use behaviour inherited from
    ``BeautifulSoup``; the base class is kept so the class hierarchy stays
    as it was.
    """

    # Helpers shared by all instances.
    myKiller = MerryExtract()
    myPrepare = MerryPreProcess()

    def optimizeLayout(self, soup):
        """Run the image trimming step on ``soup`` and return it."""
        self.myPrepare.optimizePicture(soup)
        return soup

    def insertFacts(self, soup):
        """Move the "article-box-fact" boxes to the top of the article body.

        For every node directly under the parent of the first fact box that
        is not itself a fact box, the items it yields are removed. If a
        ``div`` with class ``article-box-fact column`` exists, that parent
        is then inserted as the first child of the ``article-body`` div.
        Returns ``soup``.
        """
        thefactpart = re.compile('^article-box-fact.*$')
        allfacts = soup.findAll('div', {'class': thefactpart})
        if not allfacts:
            return soup

        # findAll returns matches in document order, so the first entry is
        # the tag a plain find() would return.
        allfactsparent = allfacts[0].parent
        for part in allfactsparent:
            if part not in allfacts:
                self.myKiller.safeRemovePart(part, True)

        if soup.find('div', {'class': 'article-box-fact column'}):
            try:
                contenttag = soup.find('div', {'class': 'article-body'})
                if contenttag:
                    contenttag.insert(0, allfactsparent)
            except Exception:
                # Best effort, reported the same way as the other steps.
                print('\n!!error in inserting facts!!\n')
        return soup

    def moveTitleAndAuthor(self, soup):
        """Reposition the headline and the byline inside ``soup``.

        The first ``<h1>`` becomes the first child of the parent of the
        element with id ``date``. The ``div`` with class ``byline`` is
        inserted at index -1 of its grandparent. Failures are reported on
        stdout. Returns ``soup``.
        """
        title = soup.h1
        pubdate = soup.find(id="date")
        if title and pubdate:
            try:
                pubdate.parent.insert(0, title)
            except Exception:
                print('\n!!error in moving title!!\n')

        byline = soup.find('div', {'class': 'byline'})
        if byline:
            try:
                byline.parent.parent.insert(-1, byline)
            except Exception:
                print('\n!!error in moving byline!!\n')
        return soup

    def removeUnwantedTags(self, soup):
        """Relocate the fact boxes, drop empty tags and the bottom share bar.

        Returns ``soup``.
        """
        self.insertFacts(soup)
        self.removeEmptyTags(soup)
        # at end to keep author
        self.removeArrayOfTags(soup.findAll(
            attrs={'class': 'share-tools-bottom'}))
        return soup

    def removeArrayOfTags(self, souparray):
        """Extract every node in ``souparray``.

        Returns ``souparray``, or ``False`` if it was empty or removal
        failed.
        """
        return self.myKiller.safeRemovePart(souparray, True)

    def removeEmptyTags(self, soup, run=0):
        """Remove tags that have no child tags and no visible text.

        Removing a tag can leave its parent empty, so the search is repeated
        until a pass finds nothing more. Tags flagged ``isSelfClosing`` are
        always kept.

        :param soup: tree to clean in place.
        :param run: unused; kept so existing callers keep working.
        :return: ``soup``.
        """
        emptymatches = re.compile(r'^[ \s\n\r\t ]*$')

        def is_empty(tag):
            if tag.find(True) is not None or tag.isSelfClosing:
                return False
            text = tag.string
            if text is None or text.strip() == "":
                return True
            # The pattern has to be matched against the text; comparing the
            # text with the compiled pattern object is always False.
            return emptymatches.match(text) is not None

        while True:
            emptytags = soup.findAll(is_empty)
            if not emptytags:
                break
            # Stop when removal fails, otherwise the same tags would be
            # found again on every pass.
            if self.removeArrayOfTags(emptytags) is False:
                break
        return soup