This commit is contained in:
Kovid Goyal 2022-06-30 21:36:14 +05:30
parent 248fbd3192
commit 65c55a6d44
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 86 additions and 46 deletions

View File

@ -2,11 +2,11 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
import datetime
import json
from time import sleep
from mechanize import Request
from contextlib import closing
import re
class NRC(BasicNewsRecipe):
title = 'NRC'
__author__ = 'Cristi Ghera'
@ -17,10 +17,13 @@ class NRC(BasicNewsRecipe):
country = 'NL'
category = 'news, politics, Netherlands'
resolve_internal_links = True
remove_tags_before = {'class':'article__header-and-content'}
remove_tags_after = {'class':'article__header-and-content'}
remove_tags_before = {'class': 'article__header-and-content'}
remove_tags_after = {'class': 'article__header-and-content'}
remove_tags = [
dict(attrs={'class':['article__footer',
dict(
attrs={
'class': [
'article__footer',
'lees-ook',
'luister-naar',
'print-layout-warning',
@ -28,7 +31,10 @@ class NRC(BasicNewsRecipe):
'article__byline',
'article__published-in',
'article__featured-image__caption__producer',
'metabox',]}),
'metabox',
]
}
),
dict(name=['script', 'noscript', 'style']),
]
remove_attributes = ["class", "id", "name", "style"]
@ -51,7 +57,9 @@ class NRC(BasicNewsRecipe):
if not title:
return title
if self.title_regexp is None:
self.title_regexp = re.compile(r'<span class="keyword">([^<]+)</span>\s*')
self.title_regexp = re.compile(
r'<span class="keyword">([^<]+)</span>\s*'
)
return self.title_regexp.sub(r"\1 ", title)
def parse_index(self):
@ -64,15 +72,22 @@ class NRC(BasicNewsRecipe):
}
monthly_list_urls = [
self._monthly_list_url(today),
self._monthly_list_url(datetime.date(today.year, today.month, 1) - datetime.timedelta(days=1))
self._monthly_list_url(
datetime.date(today.year, today.month, 1) -
datetime.timedelta(days=1)
)
]
issue_url = None
issue_date = None
for monthly_list_url in monthly_list_urls:
with closing(self.browser.open(Request(monthly_list_url, None, headers))) as r:
with closing(
self.browser.open(Request(monthly_list_url, None, headers))
) as r:
issues = json.loads(r.read())
if len(issues) > 0:
issue_date = datetime.datetime.strptime(issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ")
issue_date = datetime.datetime.strptime(
issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ"
)
issue_url = self._monthly_list_url(issue_date, "%Y/%m/%d/")
self.frontpage = issues[0]["frontpage"]
break
@ -93,14 +108,12 @@ class NRC(BasicNewsRecipe):
if doc not in documents:
self.log.warn('Document not found:', doc)
continue
articles.append(dict(
title=documents[doc]["headline"],
url=documents[doc]["url"]
))
sections.append((
section["name"],
articles
))
articles.append(
dict(
title=documents[doc]["headline"], url=documents[doc]["url"]
)
)
sections.append((section["name"], articles))
return sections
def preprocess_html(self, soup):

View File

@ -2,6 +2,7 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
import uuid
class Volkskrant(BasicNewsRecipe):
title = 'Volkskrant'
__author__ = 'Cristi Ghera'
@ -12,7 +13,18 @@ class Volkskrant(BasicNewsRecipe):
remove_tags_before = dict(id='main-content')
remove_tags_after = dict(id='main-content')
remove_tags = [
dict(attrs={'class':['article-footer__sharing', 'artstyle__editorial-tips', 'artstyle__advertisement','artstyle__container__icon','artstyle__disabled-embed','container__title__icon',]}),
dict(
attrs={
'class': [
'article-footer__sharing',
'artstyle__editorial-tips',
'artstyle__advertisement',
'artstyle__container__icon',
'artstyle__disabled-embed',
'container__title__icon',
]
}
),
dict(attrs={'data-element-id': ['article-element-authors']}),
dict(name=['script', 'noscript', 'style']),
]
@ -22,7 +34,9 @@ class Volkskrant(BasicNewsRecipe):
ignore_duplicate_articles = {'url'}
def parse_index(self):
soup = self.index_to_soup('https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4()))
soup = self.index_to_soup(
'https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4())
)
containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
sections = []
for container in containers:
@ -37,9 +51,18 @@ class Volkskrant(BasicNewsRecipe):
if '/editie/' not in url:
continue
header = a.find('header')
teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip()
teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip()
teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip()
teaser_label = self.tag_to_string(
header.find('h4').find('span', attrs={'class': 'teaser__label'})
).strip()
teaser_sublabel = self.tag_to_string(
header.find('h4'
).find('span', attrs={'class': 'teaser__sublabel'})
).strip()
teaser_title = self.tag_to_string(
header.find('h3').find(
'span', attrs={'class': 'teaser__title__value--short'}
)
).strip()
if teaser_label.lower() == "podcast":
continue
parts = []
@ -52,11 +75,15 @@ class Volkskrant(BasicNewsRecipe):
article_title = ' \u2022 '.join(parts)
pubdate = ''
description = ''
articles.append(dict(title=article_title,
articles.append(
dict(
title=article_title,
url=url,
date=pubdate,
description=description,
content=''))
content=''
)
)
sections.append((section_title, articles))
return sections