This commit is contained in:
Kovid Goyal 2022-06-30 21:35:25 +05:30
commit 248fbd3192
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 179 additions and 136 deletions

View File

@ -1,55 +1,122 @@
__license__ = 'GPL v3' #!/usr/bin/env python
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>' from calibre.web.feeds.recipes import BasicNewsRecipe
''' import datetime
nrc.nl import json
''' from time import sleep
from mechanize import Request
from contextlib import closing
import re
from calibre.web.feeds.news import BasicNewsRecipe class NRC(BasicNewsRecipe):
from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()):
impl = getattr(soup, 'new_tag', None)
if impl is not None:
return impl(name, attrs=dict(attrs))
return Tag(soup, name, attrs=attrs or None)
class Pagina12(BasicNewsRecipe):
title = 'NRC' title = 'NRC'
__author__ = 'Darko Miletic' __author__ = 'Cristi Ghera'
description = 'News from Netherlands' max_articles_per_feed = 100
publisher = 'nrc.nl' description = 'NRC - Nieuws, achtergronden en onderzoeksjournalistiek'
category = 'news, politics, Netherlands' needs_subscription = False
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'nl' language = 'nl'
country = 'NL' country = 'NL'
remove_empty_feeds = True category = 'news, politics, Netherlands'
masthead_url = 'http://www.nrc.nl/nrc.nl/images/logo_nrc.png' resolve_internal_links = True
remove_tags_before = {'class':'article__header-and-content'}
keep_only_tags = [ remove_tags_after = {'class':'article__header-and-content'}
dict(name=['h1', 'figure']), remove_tags = [
dict(attrs={'class': ['intro', 'byline']}), dict(attrs={'class':['article__footer',
dict(attrs={'class': lambda x: x and 'article__content' in x}), 'lees-ook',
'luister-naar',
'print-layout-warning',
'newslettersignup',
'article__byline',
'article__published-in',
'article__featured-image__caption__producer',
'metabox',]}),
dict(name=['script', 'noscript', 'style']),
] ]
remove_attributes = ['style'] remove_attributes = ["class", "id", "name", "style"]
encoding = 'utf-8'
feeds = ['http://www.nrc.nl/rss/'] no_stylesheets = True
ignore_duplicate_articles = {'url'}
delay = 0.3
touchscreen = True
frontpage = None
title_regexp = None
@staticmethod
def _monthly_list_url(date, fmt="%Y/%m/"):
return "https://www.nrc.nl/de/data/NH/" + date.strftime(fmt)
def _clean_article_title(self, title):
if not title:
return title
if self.title_regexp is None:
self.title_regexp = re.compile(r'<span class="keyword">([^<]+)</span>\s*')
return self.title_regexp.sub(r"\1 ", title)
    def parse_index(self):
        """Locate the most recent paper edition and build the section list.

        Queries the nrc.nl edition JSON API: first a monthly index (current
        month, then the previous month as fallback) to find the newest
        issue, then that issue's per-day document for its sections and
        headlines.  Returns a list of (section_name, articles) tuples in
        the shape calibre expects; empty list if no issue is found.
        """
        sections = []
        today = datetime.date.today()
        # NOTE(review): XHR-style headers — presumably required for the
        # endpoint to answer JSON; confirm against the live site.
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'DNT': '1',
        }
        # Current month first; then the last day of the previous month,
        # which covers the start-of-month case with no issue published yet.
        monthly_list_urls = [
            self._monthly_list_url(today),
            self._monthly_list_url(datetime.date(today.year, today.month, 1) - datetime.timedelta(days=1))
        ]
        issue_url = None
        issue_date = None
        for monthly_list_url in monthly_list_urls:
            with closing(self.browser.open(Request(monthly_list_url, None, headers))) as r:
                issues = json.loads(r.read())
            if len(issues) > 0:
                # issues[0] is taken as the newest issue of that month.
                issue_date = datetime.datetime.strptime(issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ")
                issue_url = self._monthly_list_url(issue_date, "%Y/%m/%d/")
                # Stash the front-page image URL for get_cover_url().
                self.frontpage = issues[0]["frontpage"]
                break
        if issue_url is None:
            return []
        with closing(self.browser.open(Request(issue_url, None, headers))) as r:
            edition = json.loads(r.read())
        # Index documents by id so sections can reference them below.
        documents = {}
        for headline in edition["paperheadlines"]:
            item = headline["item"]
            documents[headline["document_id"]] = dict(
                url=item["full_url"],
                headline=self._clean_article_title(item["headline"])
            )
        for section in edition["sections"]:
            articles = []
            for doc in section["document_ids"]:
                if doc not in documents:
                    self.log.warn('Document not found:', doc)
                    continue
                articles.append(dict(
                    title=documents[doc]["headline"],
                    url=documents[doc]["url"]
                ))
            sections.append((
                section["name"],
                articles
            ))
        return sections
def preprocess_html(self, soup): def preprocess_html(self, soup):
src = None for tag in soup():
for meta in soup.findAll('meta', itemprop='image', content=True): if tag.name == 'img':
src = meta['content'] if tag.has_attr('data-src-medium'):
break tag['src'] = tag['data-src-medium'].split("|")[0]
if src is not None: elif tag.has_attr('data-src'):
div = soup.find( tag['src'] = tag['data-src'].split("|")[0]
'div', attrs={'class': lambda x: x and 'featured-img' in x}) if tag['src'].startswith('//'):
if div is not None: tag['src'] = 'https:' + tag['src']
img = new_tag(soup, 'img') elif tag['src'].startswith('/'):
img['src'] = src tag['src'] = 'https://www.nrc.nl' + tag['src']
div.append(img) if self.browser.cookiejar:
self.browser.cookiejar.clear()
return soup return soup
def get_cover_url(self):
return self.frontpage

View File

@ -1,93 +1,69 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai #!/usr/bin/env python
from __future__ import with_statement from calibre.web.feeds.recipes import BasicNewsRecipe
import uuid
__license__ = 'GPL v3' class Volkskrant(BasicNewsRecipe):
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' title = 'Volkskrant'
__docformat__ = 'restructuredtext en' __author__ = 'Cristi Ghera'
'''
Modified by Tony Stegall
on 10/10/10 to include function to grab print version of articles
'''
from datetime import date
from calibre.web.feeds.news import BasicNewsRecipe
'''
added by Tony Stegall
'''
#######################################################
from calibre.ptempfile import PersistentTemporaryFile
#######################################################
class AdvancedUserRecipe1249039563(BasicNewsRecipe):
title = u'De Volkskrant'
__author__ = 'acidzebra'
oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
description = 'Volkskrant - Nieuws, achtergronden en columns'
needs_subscription = False
resolve_internal_links = True
remove_tags_before = dict(id='main-content')
remove_tags_after = dict(id='main-content')
remove_tags = [
dict(attrs={'class':['article-footer__sharing', 'artstyle__editorial-tips', 'artstyle__advertisement','artstyle__container__icon','artstyle__disabled-embed','container__title__icon',]}),
dict(attrs={'data-element-id': ['article-element-authors']}),
dict(name=['script', 'noscript', 'style']),
]
remove_attributes = ["class", "id", "name", "style"]
encoding = 'utf-8'
no_stylesheets = True no_stylesheets = True
language = 'nl' ignore_duplicate_articles = {'url'}
    def parse_index(self):
        """Scrape today's Volkskrant edition page and build the section list.

        The privacy-wall accept URL is fetched first so the edition page is
        served past the consent wall.  Returns a list of
        (section_title, articles) tuples in the shape calibre expects.
        """
        # Fresh random authId per run for the privacy-wall accept request.
        soup = self.index_to_soup('https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4()))
        containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
        sections = []
        for container in containers:
            section_title = self.tag_to_string(container.find('h2')).strip()
            articles = []
            for art in container.findAll('article'):
                a = art.find('a')
                url = a['href']
                # Site-relative links are made absolute.
                if url[0] == '/':
                    url = 'https://www.volkskrant.nl' + url
                # Keep only paper-edition articles.
                if '/editie/' not in url:
                    continue
                header = a.find('header')
                teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip()
                teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip()
                teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip()
                # Podcast entries are skipped entirely.
                if teaser_label.lower() == "podcast":
                    continue
                # Article title is "LABEL • sublabel • title", omitting
                # whichever parts are empty.
                parts = []
                if teaser_label:
                    parts.append(teaser_label.upper())
                if teaser_sublabel:
                    parts.append(teaser_sublabel)
                if teaser_title:
                    parts.append(teaser_title)
                article_title = ' \u2022 '.join(parts)
                pubdate = ''
                description = ''
                articles.append(dict(title=article_title,
                                     url=url,
                                     date=pubdate,
                                     description=description,
                                     content=''))
            sections.append((section_title, articles))
        return sections
extra_css = ''' def preprocess_html(self, soup):
body{font-family:Arial,Helvetica,sans-serif;font-size:small;} for tag in soup():
h1{font-size:large;} if tag.name == 'img':
''' if tag['src'][0] == '/':
''' tag['src'] = 'https://www.volkskrant.nl' + tag['src']
Change Log: return soup
Date: 10/10/10 - Modified code to include obfuscated to get the print version
Author: Tony Stegall
Date: 01/01/11 - Modified for better results around December/January.
Author: Martin Tarenskeen
'''
# #########################################################################
temp_files = []
articles_are_obfuscated = True
def get_obfuscated_article(self, url):
br = self.browser.clone_browser()
br.open(url)
year = date.today().year
try:
response = br.follow_link(
url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)' % year, nr=0)
html = response.read()
except:
year = year - 1
try:
response = br.follow_link(
url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)' % year, nr=0)
html = response.read()
except:
response = br.open(url)
html = response.read()
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write(html)
self.temp_files[-1].close()
return self.temp_files[-1].name
# #########################################################################
'''
Change Log:
Date: 10/15/2010
Feeds updated by Martin Tarenskeen
Date: 09/09/2012
Feeds updated by Eric Lammerts
'''
feeds = [
(u'Nieuws', u'http://www.volkskrant.nl/nieuws/rss.xml'),
(u'Binnenland', u'http://www.volkskrant.nl/nieuws/binnenland/rss.xml'),
(u'Buitenland', u'http://www.volkskrant.nl/buitenland/rss.xml'),
(u'Economie', u'http://www.volkskrant.nl/nieuws/economie/rss.xml'),
(u'Politiek', u'http://www.volkskrant.nl/politiek/rss.xml'),
(u'Sport', u'http://www.volkskrant.nl/sport/rss.xml'),
(u'Cultuur', u'http://www.volkskrant.nl/nieuws/cultuur/rss.xml'),
(u'Gezondheid & wetenschap',
u'http://www.volkskrant.nl/nieuws/gezondheid--wetenschap/rss.xml'),
(u'Tech & Media', u'http://www.volkskrant.nl/tech-media/rss.xml'),
(u'Reizen', u'http://www.volkskrant.nl/nieuws/reizen/rss.xml'),
(u'Opinie', u'http://www.volkskrant.nl/opinie/rss.xml'),
(u'Opmerkelijk', u'http://www.volkskrant.nl/nieuws/opmerkelijk/rss.xml')]