This commit is contained in:
Kovid Goyal 2022-06-30 21:35:25 +05:30
commit 248fbd3192
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 179 additions and 136 deletions

View File

@@ -1,55 +1,122 @@
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
nrc.nl
'''
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import datetime
import json
from time import sleep
from mechanize import Request
from contextlib import closing
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()):
    """Create a new tag on *soup*, bridging old and new BeautifulSoup APIs."""
    # bs4 soups expose a new_tag() factory; older trees need Tag() directly.
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))
class Pagina12(BasicNewsRecipe):
class NRC(BasicNewsRecipe):
title = 'NRC'
__author__ = 'Darko Miletic'
description = 'News from Netherlands'
publisher = 'nrc.nl'
category = 'news, politics, Netherlands'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
__author__ = 'Cristi Ghera'
max_articles_per_feed = 100
description = 'NRC - Nieuws, achtergronden en onderzoeksjournalistiek'
needs_subscription = False
language = 'nl'
country = 'NL'
remove_empty_feeds = True
masthead_url = 'http://www.nrc.nl/nrc.nl/images/logo_nrc.png'
keep_only_tags = [
dict(name=['h1', 'figure']),
dict(attrs={'class': ['intro', 'byline']}),
dict(attrs={'class': lambda x: x and 'article__content' in x}),
category = 'news, politics, Netherlands'
resolve_internal_links = True
remove_tags_before = {'class':'article__header-and-content'}
remove_tags_after = {'class':'article__header-and-content'}
remove_tags = [
dict(attrs={'class':['article__footer',
'lees-ook',
'luister-naar',
'print-layout-warning',
'newslettersignup',
'article__byline',
'article__published-in',
'article__featured-image__caption__producer',
'metabox',]}),
dict(name=['script', 'noscript', 'style']),
]
remove_attributes = ['style']
remove_attributes = ["class", "id", "name", "style"]
encoding = 'utf-8'
no_stylesheets = True
ignore_duplicate_articles = {'url'}
delay = 0.3
feeds = ['http://www.nrc.nl/rss/']
touchscreen = True
frontpage = None
title_regexp = None
@staticmethod
def _monthly_list_url(date, fmt="%Y/%m/"):
    """Return the NRC archive JSON URL for *date* (monthly by default,
    daily when fmt="%Y/%m/%d/")."""
    suffix = date.strftime(fmt)
    return "https://www.nrc.nl/de/data/NH/" + suffix
def _clean_article_title(self, title):
    """Strip the <span class="keyword"> wrapper NRC puts around headline
    keywords, keeping the keyword text itself."""
    if not title:
        return title
    # Compile once and cache; reused for every headline in the issue.
    pattern = self.title_regexp
    if pattern is None:
        pattern = re.compile(r'<span class="keyword">([^<]+)</span>\s*')
        self.title_regexp = pattern
    return pattern.sub(r"\1 ", title)
def parse_index(self):
    """Build the feed structure from NRC's JSON archive endpoints.

    Returns a list of (section_name, [article dict, ...]) tuples, or []
    when no recent issue can be located.
    """
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'DNT': '1',
    }
    today = datetime.date.today()
    # Try the current month first, then the previous month, so an issue
    # is found even on the first day(s) of a month.
    last_of_prev_month = datetime.date(today.year, today.month, 1) - datetime.timedelta(days=1)
    candidate_urls = [
        self._monthly_list_url(today),
        self._monthly_list_url(last_of_prev_month),
    ]
    issue_url = None
    issue_date = None
    for candidate in candidate_urls:
        with closing(self.browser.open(Request(candidate, None, headers))) as response:
            issues = json.loads(response.read())
        if issues:
            issue_date = datetime.datetime.strptime(issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ")
            issue_url = self._monthly_list_url(issue_date, "%Y/%m/%d/")
            # Remember the frontpage image; get_cover_url() reads it later.
            self.frontpage = issues[0]["frontpage"]
            break
    if issue_url is None:
        return []
    with closing(self.browser.open(Request(issue_url, None, headers))) as response:
        edition = json.loads(response.read())
    # Index paper headlines by document id for the section walk below.
    documents = {
        headline["document_id"]: dict(
            url=headline["item"]["full_url"],
            headline=self._clean_article_title(headline["item"]["headline"]),
        )
        for headline in edition["paperheadlines"]
    }
    sections = []
    for section in edition["sections"]:
        articles = []
        for doc in section["document_ids"]:
            if doc in documents:
                articles.append(dict(
                    title=documents[doc]["headline"],
                    url=documents[doc]["url"],
                ))
            else:
                self.log.warn('Document not found:', doc)
        sections.append((section["name"], articles))
    return sections
def preprocess_html(self, soup):
    """Inject the featured image and normalize lazy-loaded <img> URLs.

    Fix: the original read tag['src'] unconditionally after the
    data-src fallbacks, raising KeyError for an <img> that has no src
    attribute at all; that access is now guarded.
    """
    # Use the article's meta image (first match only) as the featured image.
    src = None
    for meta in soup.findAll('meta', itemprop='image', content=True):
        src = meta['content']
        break
    if src is not None:
        div = soup.find(
            'div', attrs={'class': lambda x: x and 'featured-img' in x})
        if div is not None:
            img = new_tag(soup, 'img')
            img['src'] = src
            div.append(img)
    for tag in soup():
        if tag.name == 'img':
            # Lazy-load attributes may hold "url|metadata" pairs; keep the URL.
            if tag.has_attr('data-src-medium'):
                tag['src'] = tag['data-src-medium'].split("|")[0]
            elif tag.has_attr('data-src'):
                tag['src'] = tag['data-src'].split("|")[0]
            # Guard: some images carry no src attribute (previously KeyError).
            if tag.has_attr('src'):
                if tag['src'].startswith('//'):
                    tag['src'] = 'https:' + tag['src']
                elif tag['src'].startswith('/'):
                    tag['src'] = 'https://www.nrc.nl' + tag['src']
    # NOTE(review): cookies are cleared after every article — presumably to
    # avoid metered-access tracking; confirm before removing.
    if self.browser.cookiejar:
        self.browser.cookiejar.clear()
    return soup
def get_cover_url(self):
    """Return the frontpage image URL captured earlier by parse_index()."""
    return self.frontpage

View File

@@ -1,93 +1,69 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import uuid
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Modified by Tony Stegall
on 10/10/10 to include function to grab print version of articles
'''
from datetime import date
from calibre.web.feeds.news import BasicNewsRecipe
'''
added by Tony Stegall
'''
#######################################################
from calibre.ptempfile import PersistentTemporaryFile
#######################################################
class AdvancedUserRecipe1249039563(BasicNewsRecipe):
title = u'De Volkskrant'
__author__ = 'acidzebra'
oldest_article = 7
class Volkskrant(BasicNewsRecipe):
title = 'Volkskrant'
__author__ = 'Cristi Ghera'
max_articles_per_feed = 100
description = 'Volkskrant - Nieuws, achtergronden en columns'
needs_subscription = False
resolve_internal_links = True
remove_tags_before = dict(id='main-content')
remove_tags_after = dict(id='main-content')
remove_tags = [
dict(attrs={'class':['article-footer__sharing', 'artstyle__editorial-tips', 'artstyle__advertisement','artstyle__container__icon','artstyle__disabled-embed','container__title__icon',]}),
dict(attrs={'data-element-id': ['article-element-authors']}),
dict(name=['script', 'noscript', 'style']),
]
remove_attributes = ["class", "id", "name", "style"]
encoding = 'utf-8'
no_stylesheets = True
language = 'nl'
ignore_duplicate_articles = {'url'}
extra_css = '''
body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
h1{font-size:large;}
'''
'''
Change Log:
Date: 10/10/10 - Modified code to include obfuscated to get the print version
Author: Tony Stegall
def parse_index(self):
soup = self.index_to_soup('https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4()))
containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
sections = []
for container in containers:
section_title = self.tag_to_string(container.find('h2')).strip()
articles = []
Date: 01/01/11 - Modified for better results around December/January.
Author: Martin Tarenskeen
'''
# #########################################################################
temp_files = []
articles_are_obfuscated = True
for art in container.findAll('article'):
a = art.find('a')
url = a['href']
if url[0] == '/':
url = 'https://www.volkskrant.nl' + url
if '/editie/' not in url:
continue
header = a.find('header')
teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip()
teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip()
teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip()
if teaser_label.lower() == "podcast":
continue
parts = []
if teaser_label:
parts.append(teaser_label.upper())
if teaser_sublabel:
parts.append(teaser_sublabel)
if teaser_title:
parts.append(teaser_title)
article_title = ' \u2022 '.join(parts)
pubdate = ''
description = ''
articles.append(dict(title=article_title,
url=url,
date=pubdate,
description=description,
content=''))
def get_obfuscated_article(self, url):
br = self.browser.clone_browser()
br.open(url)
year = date.today().year
sections.append((section_title, articles))
return sections
try:
response = br.follow_link(
url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)' % year, nr=0)
html = response.read()
except:
year = year - 1
try:
response = br.follow_link(
url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)' % year, nr=0)
html = response.read()
except:
response = br.open(url)
html = response.read()
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write(html)
self.temp_files[-1].close()
return self.temp_files[-1].name
# #########################################################################
'''
Change Log:
Date: 10/15/2010
Feeds updated by Martin Tarenskeen
Date: 09/09/2012
Feeds updated by Eric Lammerts
'''
feeds = [
(u'Nieuws', u'http://www.volkskrant.nl/nieuws/rss.xml'),
(u'Binnenland', u'http://www.volkskrant.nl/nieuws/binnenland/rss.xml'),
(u'Buitenland', u'http://www.volkskrant.nl/buitenland/rss.xml'),
(u'Economie', u'http://www.volkskrant.nl/nieuws/economie/rss.xml'),
(u'Politiek', u'http://www.volkskrant.nl/politiek/rss.xml'),
(u'Sport', u'http://www.volkskrant.nl/sport/rss.xml'),
(u'Cultuur', u'http://www.volkskrant.nl/nieuws/cultuur/rss.xml'),
(u'Gezondheid & wetenschap',
u'http://www.volkskrant.nl/nieuws/gezondheid--wetenschap/rss.xml'),
(u'Tech & Media', u'http://www.volkskrant.nl/tech-media/rss.xml'),
(u'Reizen', u'http://www.volkskrant.nl/nieuws/reizen/rss.xml'),
(u'Opinie', u'http://www.volkskrant.nl/opinie/rss.xml'),
(u'Opmerkelijk', u'http://www.volkskrant.nl/nieuws/opmerkelijk/rss.xml')]
def preprocess_html(self, soup):
    """Make site-relative image URLs absolute.

    Fix: the original indexed tag['src'][0] unconditionally, raising
    KeyError for an <img> without a src attribute and IndexError for an
    empty src; both cases are now skipped.
    """
    for tag in soup():
        if tag.name == 'img':
            src = tag.get('src')
            # Matches both "/path" and protocol-relative "//host/path",
            # exactly as the original startswith-'/' check did.
            if src and src[0] == '/':
                tag['src'] = 'https://www.volkskrant.nl' + src
    return soup