diff --git a/recipes/metro_news_nl.recipe b/recipes/metro_news_nl.recipe index 07b08bd5e5..5001e9992b 100644 --- a/recipes/metro_news_nl.recipe +++ b/recipes/metro_news_nl.recipe @@ -2,7 +2,7 @@ from calibre.web.feeds.news import BasicNewsRecipe import re from calibre.utils.magick import Image -from BeautifulSoup import BeautifulSoup +from calibre.ebooks.BeautifulSoup import BeautifulSoup ''' Version 1.2, updated cover image to match the changed website. added info date on title @@ -61,13 +61,13 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe): cover_url = 'http://www.readmetro.com/en/holland/metro-holland/image/large/last/' publication_type = 'newspaper' encoding = 'utf-8' - remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope']#, 'href'] + remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope'] # , 'href'] use_embedded_content = False extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar,.calibre5{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact{font-size:0.7em}.article-box-fact.module-title{margin:8px 0; font-size:0.8em}h2{font-size:1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2,.calibre5,.calibrenavbar{border:0;padding:0}.column1,h1,h2,.calibrenavbar{margin:0}' - preprocess_regexps = [ - (re.compile(r'( |\s|]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)', re.DOTALL|re.IGNORECASE),lambda match: ' '), + (re.compile(r'( |\s|]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)', + re.DOTALL|re.IGNORECASE),lambda match: ' '), #(re.compile(r'( |\s)+', re.DOTALL|re.IGNORECASE),lambda match:' '), #(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'') #(re.compile('(.*?', re.DOTALL|re.IGNORECASE), lambda match: '' + match.group().replace('','').strip().replace('','').strip() + ''), + (re.compile(r'.*?', re.DOTALL|re.IGNORECASE), lambda match: '' + + match.group().replace('','').strip().replace('','').strip() + ''), ] - keep_only_tags = [dict(name='div', attrs={'class':['article']}),dict(name='td', attrs={'class':['txt_articulo']})] remove_tags = [ @@ -51,65 +49,63 @@ class RevistaMuyInteresante(BasicNewsRecipe): remove_tags_after = dict(name='div', attrs={'class':'tags_articles'}) - - #TO GET ARTICLES IN SECTION + # TO GET ARTICLES IN SECTION def nz_parse_section(self, url): - soup = self.index_to_soup(url) - div = soup.find(attrs={'class':'contenido'}) - current_articles = [] - for x in div.findAllNext(attrs={'class':['headline']}): - a = x.find('a', href=True) - if a is None: - continue - title = self.tag_to_string(a) - url = a.get('href', False) - if not url or not title: - continue - if url.startswith('/'): - url = 'http://www.muyinteresante.es'+url + soup = self.index_to_soup(url) + div = soup.find(attrs={'class':'contenido'}) + current_articles = [] + for x in div.findAllNext(attrs={'class':['headline']}): + a = x.find('a', href=True) + if a is None: + continue + title = self.tag_to_string(a) + url = a.get('href', False) + if not url or not title: + continue + if url.startswith('/'): + url = 'http://www.muyinteresante.es'+url # self.log('\t\tFound article:', title) # self.log('\t\t\t', url) - current_articles.append({'title': title, 'url':url, - 'description':'', 'date':''}) - - return current_articles + current_articles.append({'title': title, 'url':url, + 'description':'', 'date':''}) + return current_articles # To GET SECTIONS def parse_index(self): - feeds = [] - for title, url in [ - ('Historia', - 'http://www.muyinteresante.es/historia-articulos'), - ('Ciencia', - 'http://www.muyinteresante.es/ciencia-articulos'), - ('Naturaleza', - 'http://www.muyinteresante.es/naturaleza-articulos'), - ('Tecnología', - 'http://www.muyinteresante.es/tecnologia-articulos'), - ('Salud', - 'http://www.muyinteresante.es/salud-articulos'), - ('Más Muy', - 'http://www.muyinteresante.es/muy'), - ('Innova - Automoción', - 'http://www.muyinteresante.es/articulos-innovacion-autos'), - ('Innova - Salud', - 'http://www.muyinteresante.es/articulos-innovacion-salud'), - ('Innova - Medio Ambiente', - 'http://www.muyinteresante.es/articulos-innovacion-medio-ambiente'), - ('Innova - Alimentación', - 'http://www.muyinteresante.es/articulos-innovacion-alimentacion'), - ('Innova - Sociedad', - 'http://www.muyinteresante.es/articulos-innovacion-sociedad'), - ('Innova - Tecnología', - 'http://www.muyinteresante.es/articulos-innovacion-tecnologia'), - ('Innova - Ocio', - 'http://www.muyinteresante.es/articulos-innovacion-ocio'), - ]: - articles = self.nz_parse_section(url) - if articles: - feeds.append((title, articles)) - return feeds + feeds = [] + for title, url in [ + ('Historia', + 'http://www.muyinteresante.es/historia-articulos'), + ('Ciencia', + 'http://www.muyinteresante.es/ciencia-articulos'), + ('Naturaleza', + 'http://www.muyinteresante.es/naturaleza-articulos'), + ('Tecnología', + 'http://www.muyinteresante.es/tecnologia-articulos'), + ('Salud', + 'http://www.muyinteresante.es/salud-articulos'), + ('Más Muy', + 'http://www.muyinteresante.es/muy'), + ('Innova - Automoción', + 'http://www.muyinteresante.es/articulos-innovacion-autos'), + ('Innova - Salud', + 'http://www.muyinteresante.es/articulos-innovacion-salud'), + ('Innova - Medio Ambiente', + 'http://www.muyinteresante.es/articulos-innovacion-medio-ambiente'), + ('Innova - Alimentación', + 'http://www.muyinteresante.es/articulos-innovacion-alimentacion'), + ('Innova - Sociedad', + 'http://www.muyinteresante.es/articulos-innovacion-sociedad'), + ('Innova - Tecnología', + 'http://www.muyinteresante.es/articulos-innovacion-tecnologia'), + ('Innova - Ocio', + 'http://www.muyinteresante.es/articulos-innovacion-ocio'), + ]: + articles = self.nz_parse_section(url) + if articles: + feeds.append((title, articles)) + return feeds def get_cover_url(self): index = 'http://www.muyinteresante.es/revista' @@ -118,5 +114,3 @@ class RevistaMuyInteresante(BasicNewsRecipe): if link_item: cover_url = "http://www.muyinteresante.es"+link_item['src'] return cover_url - -