This commit is contained in:
Kovid Goyal 2022-06-30 21:36:14 +05:30
parent 248fbd3192
commit 65c55a6d44
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 86 additions and 46 deletions

View File

@ -2,11 +2,11 @@
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
import datetime import datetime
import json import json
from time import sleep
from mechanize import Request from mechanize import Request
from contextlib import closing from contextlib import closing
import re import re
class NRC(BasicNewsRecipe): class NRC(BasicNewsRecipe):
title = 'NRC' title = 'NRC'
__author__ = 'Cristi Ghera' __author__ = 'Cristi Ghera'
@ -17,18 +17,24 @@ class NRC(BasicNewsRecipe):
country = 'NL' country = 'NL'
category = 'news, politics, Netherlands' category = 'news, politics, Netherlands'
resolve_internal_links = True resolve_internal_links = True
remove_tags_before = {'class':'article__header-and-content'} remove_tags_before = {'class': 'article__header-and-content'}
remove_tags_after = {'class':'article__header-and-content'} remove_tags_after = {'class': 'article__header-and-content'}
remove_tags = [ remove_tags = [
dict(attrs={'class':['article__footer', dict(
'lees-ook', attrs={
'luister-naar', 'class': [
'print-layout-warning', 'article__footer',
'newslettersignup', 'lees-ook',
'article__byline', 'luister-naar',
'article__published-in', 'print-layout-warning',
'article__featured-image__caption__producer', 'newslettersignup',
'metabox',]}), 'article__byline',
'article__published-in',
'article__featured-image__caption__producer',
'metabox',
]
}
),
dict(name=['script', 'noscript', 'style']), dict(name=['script', 'noscript', 'style']),
] ]
remove_attributes = ["class", "id", "name", "style"] remove_attributes = ["class", "id", "name", "style"]
@ -51,7 +57,9 @@ class NRC(BasicNewsRecipe):
if not title: if not title:
return title return title
if self.title_regexp is None: if self.title_regexp is None:
self.title_regexp = re.compile(r'<span class="keyword">([^<]+)</span>\s*') self.title_regexp = re.compile(
r'<span class="keyword">([^<]+)</span>\s*'
)
return self.title_regexp.sub(r"\1 ", title) return self.title_regexp.sub(r"\1 ", title)
def parse_index(self): def parse_index(self):
@ -64,15 +72,22 @@ class NRC(BasicNewsRecipe):
} }
monthly_list_urls = [ monthly_list_urls = [
self._monthly_list_url(today), self._monthly_list_url(today),
self._monthly_list_url(datetime.date(today.year, today.month, 1) - datetime.timedelta(days=1)) self._monthly_list_url(
datetime.date(today.year, today.month, 1) -
datetime.timedelta(days=1)
)
] ]
issue_url = None issue_url = None
issue_date = None issue_date = None
for monthly_list_url in monthly_list_urls: for monthly_list_url in monthly_list_urls:
with closing(self.browser.open(Request(monthly_list_url, None, headers))) as r: with closing(
self.browser.open(Request(monthly_list_url, None, headers))
) as r:
issues = json.loads(r.read()) issues = json.loads(r.read())
if len(issues) > 0: if len(issues) > 0:
issue_date = datetime.datetime.strptime(issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ") issue_date = datetime.datetime.strptime(
issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ"
)
issue_url = self._monthly_list_url(issue_date, "%Y/%m/%d/") issue_url = self._monthly_list_url(issue_date, "%Y/%m/%d/")
self.frontpage = issues[0]["frontpage"] self.frontpage = issues[0]["frontpage"]
break break
@ -93,14 +108,12 @@ class NRC(BasicNewsRecipe):
if doc not in documents: if doc not in documents:
self.log.warn('Document not found:', doc) self.log.warn('Document not found:', doc)
continue continue
articles.append(dict( articles.append(
title=documents[doc]["headline"], dict(
url=documents[doc]["url"] title=documents[doc]["headline"], url=documents[doc]["url"]
)) )
sections.append(( )
section["name"], sections.append((section["name"], articles))
articles
))
return sections return sections
def preprocess_html(self, soup): def preprocess_html(self, soup):

View File

@ -2,6 +2,7 @@
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
import uuid import uuid
class Volkskrant(BasicNewsRecipe): class Volkskrant(BasicNewsRecipe):
title = 'Volkskrant' title = 'Volkskrant'
__author__ = 'Cristi Ghera' __author__ = 'Cristi Ghera'
@ -10,9 +11,20 @@ class Volkskrant(BasicNewsRecipe):
needs_subscription = False needs_subscription = False
resolve_internal_links = True resolve_internal_links = True
remove_tags_before = dict(id='main-content') remove_tags_before = dict(id='main-content')
remove_tags_after = dict(id='main-content') remove_tags_after = dict(id='main-content')
remove_tags = [ remove_tags = [
dict(attrs={'class':['article-footer__sharing', 'artstyle__editorial-tips', 'artstyle__advertisement','artstyle__container__icon','artstyle__disabled-embed','container__title__icon',]}), dict(
attrs={
'class': [
'article-footer__sharing',
'artstyle__editorial-tips',
'artstyle__advertisement',
'artstyle__container__icon',
'artstyle__disabled-embed',
'container__title__icon',
]
}
),
dict(attrs={'data-element-id': ['article-element-authors']}), dict(attrs={'data-element-id': ['article-element-authors']}),
dict(name=['script', 'noscript', 'style']), dict(name=['script', 'noscript', 'style']),
] ]
@ -22,7 +34,9 @@ class Volkskrant(BasicNewsRecipe):
ignore_duplicate_articles = {'url'} ignore_duplicate_articles = {'url'}
def parse_index(self): def parse_index(self):
soup = self.index_to_soup('https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4())) soup = self.index_to_soup(
'https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4())
)
containers = soup.findAll('section', attrs={'class': 'section--horizontal'}) containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
sections = [] sections = []
for container in containers: for container in containers:
@ -37,9 +51,18 @@ class Volkskrant(BasicNewsRecipe):
if '/editie/' not in url: if '/editie/' not in url:
continue continue
header = a.find('header') header = a.find('header')
teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip() teaser_label = self.tag_to_string(
teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip() header.find('h4').find('span', attrs={'class': 'teaser__label'})
teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip() ).strip()
teaser_sublabel = self.tag_to_string(
header.find('h4'
).find('span', attrs={'class': 'teaser__sublabel'})
).strip()
teaser_title = self.tag_to_string(
header.find('h3').find(
'span', attrs={'class': 'teaser__title__value--short'}
)
).strip()
if teaser_label.lower() == "podcast": if teaser_label.lower() == "podcast":
continue continue
parts = [] parts = []
@ -52,11 +75,15 @@ class Volkskrant(BasicNewsRecipe):
article_title = ' \u2022 '.join(parts) article_title = ' \u2022 '.join(parts)
pubdate = '' pubdate = ''
description = '' description = ''
articles.append(dict(title=article_title, articles.append(
url=url, dict(
date=pubdate, title=article_title,
description=description, url=url,
content='')) date=pubdate,
description=description,
content=''
)
)
sections.append((section_title, articles)) sections.append((section_title, articles))
return sections return sections