diff --git a/recipes/nrc.nl.recipe b/recipes/nrc.nl.recipe
index cab485e846..5954092a61 100644
--- a/recipes/nrc.nl.recipe
+++ b/recipes/nrc.nl.recipe
@@ -2,11 +2,11 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
import datetime
import json
-from time import sleep
from mechanize import Request
from contextlib import closing
import re
+
class NRC(BasicNewsRecipe):
title = 'NRC'
__author__ = 'Cristi Ghera'
@@ -17,18 +17,24 @@ class NRC(BasicNewsRecipe):
country = 'NL'
category = 'news, politics, Netherlands'
resolve_internal_links = True
- remove_tags_before = {'class':'article__header-and-content'}
- remove_tags_after = {'class':'article__header-and-content'}
+ remove_tags_before = {'class': 'article__header-and-content'}
+ remove_tags_after = {'class': 'article__header-and-content'}
remove_tags = [
- dict(attrs={'class':['article__footer',
- 'lees-ook',
- 'luister-naar',
- 'print-layout-warning',
- 'newslettersignup',
- 'article__byline',
- 'article__published-in',
- 'article__featured-image__caption__producer',
- 'metabox',]}),
+ dict(
+ attrs={
+ 'class': [
+ 'article__footer',
+ 'lees-ook',
+ 'luister-naar',
+ 'print-layout-warning',
+ 'newslettersignup',
+ 'article__byline',
+ 'article__published-in',
+ 'article__featured-image__caption__producer',
+ 'metabox',
+ ]
+ }
+ ),
dict(name=['script', 'noscript', 'style']),
]
remove_attributes = ["class", "id", "name", "style"]
@@ -36,24 +42,26 @@ class NRC(BasicNewsRecipe):
no_stylesheets = True
ignore_duplicate_articles = {'url'}
delay = 0.3
-
+
touchscreen = True
-
+
frontpage = None
-
+
title_regexp = None
-
+
@staticmethod
def _monthly_list_url(date, fmt="%Y/%m/"):
return "https://www.nrc.nl/de/data/NH/" + date.strftime(fmt)
-
+
def _clean_article_title(self, title):
if not title:
return title
if self.title_regexp is None:
- self.title_regexp = re.compile(r'([^<]+)\s*')
+ self.title_regexp = re.compile(
+ r'([^<]+)\s*'
+ )
return self.title_regexp.sub(r"\1 ", title)
-
+
def parse_index(self):
sections = []
today = datetime.date.today()
@@ -64,15 +72,22 @@ class NRC(BasicNewsRecipe):
}
monthly_list_urls = [
self._monthly_list_url(today),
- self._monthly_list_url(datetime.date(today.year, today.month, 1) - datetime.timedelta(days=1))
+ self._monthly_list_url(
+ datetime.date(today.year, today.month, 1) -
+ datetime.timedelta(days=1)
+ )
]
issue_url = None
issue_date = None
for monthly_list_url in monthly_list_urls:
- with closing(self.browser.open(Request(monthly_list_url, None, headers))) as r:
+ with closing(
+ self.browser.open(Request(monthly_list_url, None, headers))
+ ) as r:
issues = json.loads(r.read())
if len(issues) > 0:
- issue_date = datetime.datetime.strptime(issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ")
+ issue_date = datetime.datetime.strptime(
+ issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ"
+ )
issue_url = self._monthly_list_url(issue_date, "%Y/%m/%d/")
self.frontpage = issues[0]["frontpage"]
break
@@ -93,14 +108,12 @@ class NRC(BasicNewsRecipe):
if doc not in documents:
self.log.warn('Document not found:', doc)
continue
- articles.append(dict(
- title=documents[doc]["headline"],
- url=documents[doc]["url"]
- ))
- sections.append((
- section["name"],
- articles
- ))
+ articles.append(
+ dict(
+ title=documents[doc]["headline"], url=documents[doc]["url"]
+ )
+ )
+ sections.append((section["name"], articles))
return sections
def preprocess_html(self, soup):
@@ -119,4 +132,4 @@ class NRC(BasicNewsRecipe):
return soup
def get_cover_url(self):
- return self.frontpage
\ No newline at end of file
+ return self.frontpage
diff --git a/recipes/volksrant.recipe b/recipes/volksrant.recipe
index 90338ec2fb..18741ee0a4 100644
--- a/recipes/volksrant.recipe
+++ b/recipes/volksrant.recipe
@@ -2,6 +2,7 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
import uuid
+
class Volkskrant(BasicNewsRecipe):
title = 'Volkskrant'
__author__ = 'Cristi Ghera'
@@ -10,9 +11,20 @@ class Volkskrant(BasicNewsRecipe):
needs_subscription = False
resolve_internal_links = True
remove_tags_before = dict(id='main-content')
- remove_tags_after = dict(id='main-content')
+ remove_tags_after = dict(id='main-content')
remove_tags = [
- dict(attrs={'class':['article-footer__sharing', 'artstyle__editorial-tips', 'artstyle__advertisement','artstyle__container__icon','artstyle__disabled-embed','container__title__icon',]}),
+ dict(
+ attrs={
+ 'class': [
+ 'article-footer__sharing',
+ 'artstyle__editorial-tips',
+ 'artstyle__advertisement',
+ 'artstyle__container__icon',
+ 'artstyle__disabled-embed',
+ 'container__title__icon',
+ ]
+ }
+ ),
dict(attrs={'data-element-id': ['article-element-authors']}),
dict(name=['script', 'noscript', 'style']),
]
@@ -20,15 +32,17 @@ class Volkskrant(BasicNewsRecipe):
encoding = 'utf-8'
no_stylesheets = True
ignore_duplicate_articles = {'url'}
-
+
def parse_index(self):
- soup = self.index_to_soup('https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4()))
+ soup = self.index_to_soup(
+ 'https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4())
+ )
containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
sections = []
for container in containers:
section_title = self.tag_to_string(container.find('h2')).strip()
articles = []
-
+
for art in container.findAll('article'):
a = art.find('a')
url = a['href']
@@ -37,9 +51,18 @@ class Volkskrant(BasicNewsRecipe):
if '/editie/' not in url:
continue
header = a.find('header')
- teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip()
- teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip()
- teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip()
+ teaser_label = self.tag_to_string(
+ header.find('h4').find('span', attrs={'class': 'teaser__label'})
+ ).strip()
+ teaser_sublabel = self.tag_to_string(
+ header.find('h4'
+ ).find('span', attrs={'class': 'teaser__sublabel'})
+ ).strip()
+ teaser_title = self.tag_to_string(
+ header.find('h3').find(
+ 'span', attrs={'class': 'teaser__title__value--short'}
+ )
+ ).strip()
if teaser_label.lower() == "podcast":
continue
parts = []
@@ -52,12 +75,16 @@ class Volkskrant(BasicNewsRecipe):
article_title = ' \u2022 '.join(parts)
pubdate = ''
description = ''
- articles.append(dict(title=article_title,
- url=url,
- date=pubdate,
- description=description,
- content=''))
-
+ articles.append(
+ dict(
+ title=article_title,
+ url=url,
+ date=pubdate,
+ description=description,
+ content=''
+ )
+ )
+
sections.append((section_title, articles))
return sections
@@ -66,4 +93,4 @@ class Volkskrant(BasicNewsRecipe):
if tag.name == 'img':
if tag['src'][0] == '/':
tag['src'] = 'https://www.volkskrant.nl' + tag['src']
- return soup
\ No newline at end of file
+ return soup