calibre/recipes/nrc.nl.recipe
Kovid Goyal 65c55a6d44
pep8
2022-06-30 21:36:14 +05:30

136 lines
4.6 KiB
Python

#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import datetime
import json
from mechanize import Request
from contextlib import closing
import re
class NRC(BasicNewsRecipe):
    """Calibre recipe that downloads the most recent NRC newspaper edition.

    The edition index is read from JSON endpoints under
    ``https://www.nrc.nl/de/data/NH/``; article pages are then cleaned up
    using the tag/attribute filters declared below.
    """

    title = 'NRC'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'NRC - Nieuws, achtergronden en onderzoeksjournalistiek'
    needs_subscription = False
    language = 'nl'
    country = 'NL'
    category = 'news, politics, Netherlands'
    resolve_internal_links = True

    # Only the element carrying this class is kept from each article page.
    remove_tags_before = {'class': 'article__header-and-content'}
    remove_tags_after = {'class': 'article__header-and-content'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'article__footer',
                    'lees-ook',
                    'luister-naar',
                    'print-layout-warning',
                    'newslettersignup',
                    'article__byline',
                    'article__published-in',
                    'article__featured-image__caption__producer',
                    'metabox',
                ]
            }
        ),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ["class", "id", "name", "style"]
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    delay = 0.3
    touchscreen = True

    # Cover image URL; filled in by parse_index() from the issue listing.
    frontpage = None
    # Lazily compiled pattern used by _clean_article_title().
    title_regexp = None

    @staticmethod
    def _monthly_list_url(date, fmt="%Y/%m/"):
        """Return the data URL for *date*, formatted with strftime pattern *fmt*.

        The default pattern yields a per-month URL; parse_index() passes
        "%Y/%m/%d/" to address a single day's edition.
        """
        return "https://www.nrc.nl/de/data/NH/" + date.strftime(fmt)

    def _clean_article_title(self, title):
        """Unwrap ``<span class="keyword">`` markup embedded in a headline.

        Each keyword span (plus any whitespace following it) is replaced by
        its text followed by a single space. Falsy titles are returned
        unchanged.
        """
        if not title:
            return title
        if self.title_regexp is None:
            # Compile once and keep the pattern for subsequent headlines.
            self.title_regexp = re.compile(
                r'<span class="keyword">([^<]+)</span>\s*'
            )
        return self.title_regexp.sub(r"\1 ", title)

    def _fetch_json(self, url, headers):
        """GET *url* with *headers* through the recipe browser and decode the JSON body."""
        with closing(self.browser.open(Request(url, None, headers))) as r:
            return json.loads(r.read())

    def parse_index(self):
        """Build the section/article index for the most recent edition.

        Returns a list of ``(section name, articles)`` tuples in which each
        article is a dict with ``title`` and ``url`` keys, or an empty list
        when neither the current nor the previous month lists any issue.
        As a side effect ``self.frontpage`` is set from the selected issue.
        """
        sections = []
        today = datetime.date.today()
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'DNT': '1',
        }
        # Current month first, then the previous month: the day before the
        # first of this month always falls in the previous month.
        monthly_list_urls = [
            self._monthly_list_url(today),
            self._monthly_list_url(
                datetime.date(today.year, today.month, 1) -
                datetime.timedelta(days=1)
            )
        ]

        issue_url = None
        issue_date = None
        for monthly_list_url in monthly_list_urls:
            issues = self._fetch_json(monthly_list_url, headers)
            if issues:
                # The first listed issue is used (presumably the newest one;
                # the ordering is decided by the server).
                issue_date = datetime.datetime.strptime(
                    issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ"
                )
                issue_url = self._monthly_list_url(issue_date, "%Y/%m/%d/")
                self.frontpage = issues[0]["frontpage"]
                break
        if issue_url is None:
            return []

        edition = self._fetch_json(issue_url, headers)

        # Map document id -> article data so sections can look articles up.
        documents = {
            headline["document_id"]: dict(
                url=headline["item"]["full_url"],
                headline=self._clean_article_title(headline["item"]["headline"])
            )
            for headline in edition["paperheadlines"]
        }

        for section in edition["sections"]:
            articles = []
            for doc in section["document_ids"]:
                if doc not in documents:
                    self.log.warn('Document not found:', doc)
                    continue
                articles.append(
                    dict(
                        title=documents[doc]["headline"], url=documents[doc]["url"]
                    )
                )
            sections.append((section["name"], articles))
        return sections

    def preprocess_html(self, soup):
        """Give every image a usable absolute ``src`` and clear stored cookies.

        ``data-src-medium`` (preferred) or ``data-src`` is copied into
        ``src``, keeping only the part before the first ``|``. Protocol
        relative and site relative URLs are then made absolute. Images that
        end up without any source are left untouched instead of raising
        ``KeyError``.
        """
        for img in soup('img'):
            if img.has_attr('data-src-medium'):
                img['src'] = img['data-src-medium'].split("|")[0]
            elif img.has_attr('data-src'):
                img['src'] = img['data-src'].split("|")[0]

            src = img.get('src')
            if not src:
                # No source attribute of any kind: nothing to normalise.
                continue
            if src.startswith('//'):
                img['src'] = 'https:' + src
            elif src.startswith('/'):
                img['src'] = 'https://www.nrc.nl' + src

        # Drop any cookies collected so far before the next download.
        if self.browser.cookiejar:
            self.browser.cookiejar.clear()
        return soup

    def get_cover_url(self):
        """Return the front page image URL recorded by parse_index(), or None."""
        return self.frontpage