#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import datetime
import json
from mechanize import Request
from contextlib import closing
import re

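
# Recipe for NRC's paper edition: the article index comes from NRC's JSON
# edition feed rather than RSS (see parse_index below).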
class NRC(BasicNewsRecipe):
    title = 'NRC'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'NRC - Nieuws, achtergronden en onderzoeksjournalistiek'
    needs_subscription = False
    language = 'nl'
    country = 'NL'
    category = 'news, politics, Netherlands'
    resolve_internal_links = True
    remove_tags_before = {'class': 'article__header-and-content'}
    remove_tags_after = {'class': 'article__header-and-content'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'article__footer',
                    'lees-ook',
                    'luister-naar',
                    'print-layout-warning',
                    'newslettersignup',
                    'article__byline',
                    'article__published-in',
                    'article__featured-image__caption__producer',
                    'metabox',
                ]
            }
        ),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ["class", "id", "name", "style"]
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    delay = 0.3

    touchscreen = True

    frontpage = None

    title_regexp = None
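
    # Paper editions are listed as JSON under /de/data/NH/<year>/<month>/;
    # adding a day component points at a single edition.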
    @staticmethod
    def _monthly_list_url(date, fmt="%Y/%m/"):
        return "https://www.nrc.nl/de/data/NH/" + date.strftime(fmt)
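
    # Headlines may start with an inline <span class="keyword"> element;
    # drop the markup but keep the keyword text in front of the title.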
    def _clean_article_title(self, title):
        if not title:
            return title
        if self.title_regexp is None:
            self.title_regexp = re.compile(
                r'<span class="keyword">([^<]+)</span>\s*'
            )
        return self.title_regexp.sub(r"\1 ", title)
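
    # Build the index from the most recent paper edition: find the newest
    # issue in the monthly edition lists, then map its document ids to
    # headlines and URLs, grouped per section.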
    def parse_index(self):
        sections = []
        today = datetime.date.today()
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'DNT': '1',
        }
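        # Query the current month first; if it has no editions yet (e.g. early
        # on the first of the month), fall back to the previous month.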
        monthly_list_urls = [
            self._monthly_list_url(today),
            self._monthly_list_url(
                datetime.date(today.year, today.month, 1) -
                datetime.timedelta(days=1)
            )
        ]
        issue_url = None
        issue_date = None
        for monthly_list_url in monthly_list_urls:
            with closing(
                self.browser.open(Request(monthly_list_url, None, headers))
            ) as r:
                issues = json.loads(r.read())
            if len(issues) > 0:
                issue_date = datetime.datetime.strptime(
                    issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ"
                )
                issue_url = self._monthly_list_url(issue_date, "%Y/%m/%d/")
                self.frontpage = issues[0]["frontpage"]
                break
        if issue_url is None:
            return []
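        # Fetch the chosen edition and index its headlines by document id.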
        with closing(self.browser.open(Request(issue_url, None, headers))) as r:
            edition = json.loads(r.read())
        documents = {}
        for headline in edition["paperheadlines"]:
            item = headline["item"]
            documents[headline["document_id"]] = dict(
                url=item["full_url"],
                headline=self._clean_article_title(item["headline"])
            )
        for section in edition["sections"]:
            articles = []
            for doc in section["document_ids"]:
                if doc not in documents:
                    self.log.warn('Document not found:', doc)
                    continue
                articles.append(
                    dict(
                        title=documents[doc]["headline"], url=documents[doc]["url"]
                    )
                )
            sections.append((section["name"], articles))
        return sections
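
    # Lazy-loaded images keep their real URL in data-src-medium/data-src as a
    # pipe-separated list; promote the first entry to src and make it absolute.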
    def preprocess_html(self, soup):
        for tag in soup():
            if tag.name == 'img':
                if tag.has_attr('data-src-medium'):
                    tag['src'] = tag['data-src-medium'].split("|")[0]
                elif tag.has_attr('data-src'):
                    tag['src'] = tag['data-src'].split("|")[0]
                if tag['src'].startswith('//'):
                    tag['src'] = 'https:' + tag['src']
                elif tag['src'].startswith('/'):
                    tag['src'] = 'https://www.nrc.nl' + tag['src']
        if self.browser.cookiejar:
            self.browser.cookiejar.clear()
        return soup
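
    # The front page image recorded in parse_index doubles as the cover.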
    def get_cover_url(self):
        return self.frontpage