#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import datetime
import json
from time import sleep
from mechanize import Request
from contextlib import closing
import re


class NRC(BasicNewsRecipe):
    """Fetch the current NRC paper edition via its JSON edition API.

    parse_index() walks the monthly issue listings under
    https://www.nrc.nl/de/data/NH/ to find the latest published issue,
    then builds the section/article list from that issue's edition JSON.
    """

    title = 'NRC'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'NRC - Nieuws, achtergronden en onderzoeksjournalistiek'
    needs_subscription = False
    language = 'nl'
    country = 'NL'
    category = 'news, politics, Netherlands'
    resolve_internal_links = True
    remove_tags_before = {'class': 'article__header-and-content'}
    remove_tags_after = {'class': 'article__header-and-content'}
    remove_tags = [
        dict(attrs={'class': [
            'article__footer',
            'lees-ook',
            'luister-naar',
            'print-layout-warning',
            'newslettersignup',
            'article__byline',
            'article__published-in',
            'article__featured-image__caption__producer',
            'metabox',
        ]}),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    delay = 0.3

    touchscreen = True

    # URL of the frontpage image of the selected issue; filled in by
    # parse_index() and served by get_cover_url().
    frontpage = None

    # Lazily compiled pattern used by _clean_article_title().
    title_regexp = None

    @staticmethod
    def _monthly_list_url(date, fmt="%Y/%m/"):
        """Return the issue-listing API URL for the month (or day) of *date*."""
        return "https://www.nrc.nl/de/data/NH/" + date.strftime(fmt)

    def _clean_article_title(self, title):
        """Strip markup wrappers from a headline string.

        Returns *title* unchanged when it is empty/None.
        """
        if not title:
            return title
        if self.title_regexp is None:
            # NOTE(review): this pattern looks corrupted in the source
            # (angle-bracketed text appears to have been stripped from the
            # paste); upstream it presumably matched an HTML tag around the
            # headline text — confirm against the calibre repository before
            # relying on it.
            self.title_regexp = re.compile(r'([^<]+)\s*')
        return self.title_regexp.sub(r"\1 ", title)

    def parse_index(self):
        """Build [(section_name, [article_dict, ...]), ...] for the latest issue."""
        sections = []
        today = datetime.date.today()
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'DNT': '1',
        }
        # Check the current month first; fall back to the previous month
        # (e.g. on the 1st, before any issue has been published this month).
        monthly_list_urls = [
            self._monthly_list_url(today),
            self._monthly_list_url(
                datetime.date(today.year, today.month, 1) - datetime.timedelta(days=1)
            ),
        ]
        issue_url = None
        issue_date = None
        for monthly_list_url in monthly_list_urls:
            with closing(self.browser.open(Request(monthly_list_url, None, headers))) as r:
                issues = json.loads(r.read())
                if len(issues) > 0:
                    # Issues are newest-first; take the most recent one.
                    issue_date = datetime.datetime.strptime(
                        issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ"
                    )
                    issue_url = self._monthly_list_url(issue_date, "%Y/%m/%d/")
                    self.frontpage = issues[0]["frontpage"]
                    break
        if issue_url is None:
            # No published issue found in the last two months.
            return []
        with closing(self.browser.open(Request(issue_url, None, headers))) as r:
            edition = json.loads(r.read())
        # Map document ids to their paper headline/URL so sections can be
        # resolved without refetching.
        documents = {}
        for headline in edition["paperheadlines"]:
            item = headline["item"]
            documents[headline["document_id"]] = dict(
                url=item["full_url"],
                headline=self._clean_article_title(item["headline"]),
            )
        for section in edition["sections"]:
            articles = []
            for doc in section["document_ids"]:
                if doc not in documents:
                    self.log.warn('Document not found:', doc)
                    continue
                articles.append(dict(
                    title=documents[doc]["headline"],
                    url=documents[doc]["url"],
                ))
            sections.append((section["name"], articles))
        return sections

    def preprocess_html(self, soup):
        """Resolve lazy-loaded image sources and absolutize image URLs."""
        for tag in soup():
            if tag.name == 'img':
                # The site lazy-loads images: the real URL lives in a
                # data-src(-medium) attribute, possibly with variants
                # separated by '|'; the first variant is used.
                if tag.has_attr('data-src-medium'):
                    tag['src'] = tag['data-src-medium'].split("|")[0]
                elif tag.has_attr('data-src'):
                    tag['src'] = tag['data-src'].split("|")[0]
                # Fix: an <img> without any src attribute previously raised
                # KeyError on tag['src'] here.
                if tag.has_attr('src'):
                    if tag['src'].startswith('//'):
                        tag['src'] = 'https:' + tag['src']
                    elif tag['src'].startswith('/'):
                        tag['src'] = 'https://www.nrc.nl' + tag['src']
        # Clearing cookies between articles avoids the site's metering.
        if self.browser.cookiejar:
            self.browser.cookiejar.clear()
        return soup

    def get_cover_url(self):
        """Return the frontpage image URL recorded by parse_index()."""
        return self.frontpage
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import uuid


class Volkskrant(BasicNewsRecipe):
    """Fetch today's Volkskrant edition by scraping the /editie/vandaag page.

    A random UUID is passed to the privacy wall endpoint to obtain a session
    that can see the edition overview without a subscription prompt.
    """

    title = 'Volkskrant'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Volkskrant - Nieuws, achtergronden en columns'
    needs_subscription = False
    resolve_internal_links = True
    remove_tags_before = dict(id='main-content')
    remove_tags_after = dict(id='main-content')
    remove_tags = [
        dict(attrs={'class': [
            'article-footer__sharing',
            'artstyle__editorial-tips',
            'artstyle__advertisement',
            'artstyle__container__icon',
            'artstyle__disabled-embed',
            'container__title__icon',
        ]}),
        dict(attrs={'data-element-id': ['article-element-authors']}),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        """Build [(section_name, [article_dict, ...]), ...] from the edition page."""
        soup = self.index_to_soup(
            'https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId='
            + str(uuid.uuid4())
        )
        containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
        sections = []
        for container in containers:
            section_title = self.tag_to_string(container.find('h2')).strip()
            articles = []

            for art in container.findAll('article'):
                a = art.find('a')
                url = a['href']
                if url[0] == '/':
                    url = 'https://www.volkskrant.nl' + url
                # Only paper-edition articles live under /editie/.
                if '/editie/' not in url:
                    continue
                header = a.find('header')
                teaser_label = self.tag_to_string(
                    header.find('h4').find('span', attrs={'class': 'teaser__label'})
                ).strip()
                teaser_sublabel = self.tag_to_string(
                    header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})
                ).strip()
                teaser_title = self.tag_to_string(
                    header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})
                ).strip()
                # Podcasts have no readable body; skip them.
                if teaser_label.lower() == "podcast":
                    continue
                # Compose "LABEL • sublabel • title", skipping empty parts.
                parts = []
                if teaser_label:
                    parts.append(teaser_label.upper())
                if teaser_sublabel:
                    parts.append(teaser_sublabel)
                if teaser_title:
                    parts.append(teaser_title)
                article_title = ' \u2022 '.join(parts)
                pubdate = ''
                description = ''
                articles.append(dict(
                    title=article_title,
                    url=url,
                    date=pubdate,
                    description=description,
                    content='',
                ))

            sections.append((section_title, articles))
        return sections

    def preprocess_html(self, soup):
        """Absolutize image URLs so they load outside the site context."""
        for tag in soup():
            # Fix: tag['src'][0] raised KeyError for an <img> without a src
            # attribute and IndexError for an empty src; guard both cases.
            if tag.name == 'img' and tag.has_attr('src') and tag['src']:
                if tag['src'].startswith('//'):
                    # Protocol-relative URL: prepending the site host would
                    # have produced 'https://www.volkskrant.nl//cdn...'.
                    tag['src'] = 'https:' + tag['src']
                elif tag['src'][0] == '/':
                    tag['src'] = 'https://www.volkskrant.nl' + tag['src']
        return soup