mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Merge branch 'master' of https://github.com/itsirc/calibre
This commit is contained in:
commit
248fbd3192
@ -1,55 +1,122 @@
|
|||||||
__license__ = 'GPL v3'
|
#!/usr/bin/env python
|
||||||
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
'''
|
import datetime
|
||||||
nrc.nl
|
import json
|
||||||
'''
|
from time import sleep
|
||||||
|
from mechanize import Request
|
||||||
|
from contextlib import closing
|
||||||
|
import re
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
class NRC(BasicNewsRecipe):
|
||||||
from calibre.ebooks.BeautifulSoup import Tag
|
|
||||||
|
|
||||||
|
|
||||||
def new_tag(soup, name, attrs=()):
|
|
||||||
impl = getattr(soup, 'new_tag', None)
|
|
||||||
if impl is not None:
|
|
||||||
return impl(name, attrs=dict(attrs))
|
|
||||||
return Tag(soup, name, attrs=attrs or None)
|
|
||||||
|
|
||||||
|
|
||||||
class Pagina12(BasicNewsRecipe):
|
|
||||||
title = 'NRC'
|
title = 'NRC'
|
||||||
__author__ = 'Darko Miletic'
|
__author__ = 'Cristi Ghera'
|
||||||
description = 'News from Netherlands'
|
max_articles_per_feed = 100
|
||||||
publisher = 'nrc.nl'
|
description = 'NRC - Nieuws, achtergronden en onderzoeksjournalistiek'
|
||||||
category = 'news, politics, Netherlands'
|
needs_subscription = False
|
||||||
oldest_article = 2
|
|
||||||
max_articles_per_feed = 200
|
|
||||||
no_stylesheets = True
|
|
||||||
encoding = 'utf8'
|
|
||||||
use_embedded_content = False
|
|
||||||
language = 'nl'
|
language = 'nl'
|
||||||
country = 'NL'
|
country = 'NL'
|
||||||
remove_empty_feeds = True
|
category = 'news, politics, Netherlands'
|
||||||
masthead_url = 'http://www.nrc.nl/nrc.nl/images/logo_nrc.png'
|
resolve_internal_links = True
|
||||||
|
remove_tags_before = {'class':'article__header-and-content'}
|
||||||
keep_only_tags = [
|
remove_tags_after = {'class':'article__header-and-content'}
|
||||||
dict(name=['h1', 'figure']),
|
remove_tags = [
|
||||||
dict(attrs={'class': ['intro', 'byline']}),
|
dict(attrs={'class':['article__footer',
|
||||||
dict(attrs={'class': lambda x: x and 'article__content' in x}),
|
'lees-ook',
|
||||||
|
'luister-naar',
|
||||||
|
'print-layout-warning',
|
||||||
|
'newslettersignup',
|
||||||
|
'article__byline',
|
||||||
|
'article__published-in',
|
||||||
|
'article__featured-image__caption__producer',
|
||||||
|
'metabox',]}),
|
||||||
|
dict(name=['script', 'noscript', 'style']),
|
||||||
]
|
]
|
||||||
remove_attributes = ['style']
|
remove_attributes = ["class", "id", "name", "style"]
|
||||||
|
encoding = 'utf-8'
|
||||||
feeds = ['http://www.nrc.nl/rss/']
|
no_stylesheets = True
|
||||||
|
ignore_duplicate_articles = {'url'}
|
||||||
|
delay = 0.3
|
||||||
|
|
||||||
|
touchscreen = True
|
||||||
|
|
||||||
|
frontpage = None
|
||||||
|
|
||||||
|
title_regexp = None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _monthly_list_url(date, fmt="%Y/%m/"):
|
||||||
|
return "https://www.nrc.nl/de/data/NH/" + date.strftime(fmt)
|
||||||
|
|
||||||
|
def _clean_article_title(self, title):
|
||||||
|
if not title:
|
||||||
|
return title
|
||||||
|
if self.title_regexp is None:
|
||||||
|
self.title_regexp = re.compile(r'<span class="keyword">([^<]+)</span>\s*')
|
||||||
|
return self.title_regexp.sub(r"\1 ", title)
|
||||||
|
|
||||||
|
def parse_index(self):
|
||||||
|
sections = []
|
||||||
|
today = datetime.date.today()
|
||||||
|
headers = {
|
||||||
|
'X-Requested-With': 'XMLHttpRequest',
|
||||||
|
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
||||||
|
'DNT': '1',
|
||||||
|
}
|
||||||
|
monthly_list_urls = [
|
||||||
|
self._monthly_list_url(today),
|
||||||
|
self._monthly_list_url(datetime.date(today.year, today.month, 1) - datetime.timedelta(days=1))
|
||||||
|
]
|
||||||
|
issue_url = None
|
||||||
|
issue_date = None
|
||||||
|
for monthly_list_url in monthly_list_urls:
|
||||||
|
with closing(self.browser.open(Request(monthly_list_url, None, headers))) as r:
|
||||||
|
issues = json.loads(r.read())
|
||||||
|
if len(issues) > 0:
|
||||||
|
issue_date = datetime.datetime.strptime(issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
issue_url = self._monthly_list_url(issue_date, "%Y/%m/%d/")
|
||||||
|
self.frontpage = issues[0]["frontpage"]
|
||||||
|
break
|
||||||
|
if issue_url is None:
|
||||||
|
return []
|
||||||
|
with closing(self.browser.open(Request(issue_url, None, headers))) as r:
|
||||||
|
edition = json.loads(r.read())
|
||||||
|
documents = {}
|
||||||
|
for headline in edition["paperheadlines"]:
|
||||||
|
item = headline["item"]
|
||||||
|
documents[headline["document_id"]] = dict(
|
||||||
|
url=item["full_url"],
|
||||||
|
headline=self._clean_article_title(item["headline"])
|
||||||
|
)
|
||||||
|
for section in edition["sections"]:
|
||||||
|
articles = []
|
||||||
|
for doc in section["document_ids"]:
|
||||||
|
if doc not in documents:
|
||||||
|
self.log.warn('Document not found:', doc)
|
||||||
|
continue
|
||||||
|
articles.append(dict(
|
||||||
|
title=documents[doc]["headline"],
|
||||||
|
url=documents[doc]["url"]
|
||||||
|
))
|
||||||
|
sections.append((
|
||||||
|
section["name"],
|
||||||
|
articles
|
||||||
|
))
|
||||||
|
return sections
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
src = None
|
for tag in soup():
|
||||||
for meta in soup.findAll('meta', itemprop='image', content=True):
|
if tag.name == 'img':
|
||||||
src = meta['content']
|
if tag.has_attr('data-src-medium'):
|
||||||
break
|
tag['src'] = tag['data-src-medium'].split("|")[0]
|
||||||
if src is not None:
|
elif tag.has_attr('data-src'):
|
||||||
div = soup.find(
|
tag['src'] = tag['data-src'].split("|")[0]
|
||||||
'div', attrs={'class': lambda x: x and 'featured-img' in x})
|
if tag['src'].startswith('//'):
|
||||||
if div is not None:
|
tag['src'] = 'https:' + tag['src']
|
||||||
img = new_tag(soup, 'img')
|
elif tag['src'].startswith('/'):
|
||||||
img['src'] = src
|
tag['src'] = 'https://www.nrc.nl' + tag['src']
|
||||||
div.append(img)
|
if self.browser.cookiejar:
|
||||||
|
self.browser.cookiejar.clear()
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
def get_cover_url(self):
|
||||||
|
return self.frontpage
|
@ -1,93 +1,69 @@
|
|||||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
#!/usr/bin/env python
|
||||||
from __future__ import with_statement
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
|
import uuid
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
class Volkskrant(BasicNewsRecipe):
|
||||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
title = 'Volkskrant'
|
||||||
__docformat__ = 'restructuredtext en'
|
__author__ = 'Cristi Ghera'
|
||||||
|
|
||||||
'''
|
|
||||||
Modified by Tony Stegall
|
|
||||||
on 10/10/10 to include function to grab print version of articles
|
|
||||||
'''
|
|
||||||
from datetime import date
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
|
||||||
'''
|
|
||||||
added by Tony Stegall
|
|
||||||
'''
|
|
||||||
#######################################################
|
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
|
||||||
#######################################################
|
|
||||||
|
|
||||||
|
|
||||||
class AdvancedUserRecipe1249039563(BasicNewsRecipe):
|
|
||||||
title = u'De Volkskrant'
|
|
||||||
__author__ = 'acidzebra'
|
|
||||||
oldest_article = 7
|
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
|
description = 'Volkskrant - Nieuws, achtergronden en columns'
|
||||||
|
needs_subscription = False
|
||||||
|
resolve_internal_links = True
|
||||||
|
remove_tags_before = dict(id='main-content')
|
||||||
|
remove_tags_after = dict(id='main-content')
|
||||||
|
remove_tags = [
|
||||||
|
dict(attrs={'class':['article-footer__sharing', 'artstyle__editorial-tips', 'artstyle__advertisement','artstyle__container__icon','artstyle__disabled-embed','container__title__icon',]}),
|
||||||
|
dict(attrs={'data-element-id': ['article-element-authors']}),
|
||||||
|
dict(name=['script', 'noscript', 'style']),
|
||||||
|
]
|
||||||
|
remove_attributes = ["class", "id", "name", "style"]
|
||||||
|
encoding = 'utf-8'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
language = 'nl'
|
ignore_duplicate_articles = {'url'}
|
||||||
|
|
||||||
|
def parse_index(self):
|
||||||
|
soup = self.index_to_soup('https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4()))
|
||||||
|
containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
|
||||||
|
sections = []
|
||||||
|
for container in containers:
|
||||||
|
section_title = self.tag_to_string(container.find('h2')).strip()
|
||||||
|
articles = []
|
||||||
|
|
||||||
|
for art in container.findAll('article'):
|
||||||
|
a = art.find('a')
|
||||||
|
url = a['href']
|
||||||
|
if url[0] == '/':
|
||||||
|
url = 'https://www.volkskrant.nl' + url
|
||||||
|
if '/editie/' not in url:
|
||||||
|
continue
|
||||||
|
header = a.find('header')
|
||||||
|
teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip()
|
||||||
|
teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip()
|
||||||
|
teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip()
|
||||||
|
if teaser_label.lower() == "podcast":
|
||||||
|
continue
|
||||||
|
parts = []
|
||||||
|
if teaser_label:
|
||||||
|
parts.append(teaser_label.upper())
|
||||||
|
if teaser_sublabel:
|
||||||
|
parts.append(teaser_sublabel)
|
||||||
|
if teaser_title:
|
||||||
|
parts.append(teaser_title)
|
||||||
|
article_title = ' \u2022 '.join(parts)
|
||||||
|
pubdate = ''
|
||||||
|
description = ''
|
||||||
|
articles.append(dict(title=article_title,
|
||||||
|
url=url,
|
||||||
|
date=pubdate,
|
||||||
|
description=description,
|
||||||
|
content=''))
|
||||||
|
|
||||||
|
sections.append((section_title, articles))
|
||||||
|
return sections
|
||||||
|
|
||||||
extra_css = '''
|
def preprocess_html(self, soup):
|
||||||
body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
|
for tag in soup():
|
||||||
h1{font-size:large;}
|
if tag.name == 'img':
|
||||||
'''
|
if tag['src'][0] == '/':
|
||||||
'''
|
tag['src'] = 'https://www.volkskrant.nl' + tag['src']
|
||||||
Change Log:
|
return soup
|
||||||
Date: 10/10/10 - Modified code to include obfuscated to get the print version
|
|
||||||
Author: Tony Stegall
|
|
||||||
|
|
||||||
Date: 01/01/11 - Modified for better results around December/January.
|
|
||||||
Author: Martin Tarenskeen
|
|
||||||
'''
|
|
||||||
# #########################################################################
|
|
||||||
temp_files = []
|
|
||||||
articles_are_obfuscated = True
|
|
||||||
|
|
||||||
def get_obfuscated_article(self, url):
|
|
||||||
br = self.browser.clone_browser()
|
|
||||||
br.open(url)
|
|
||||||
year = date.today().year
|
|
||||||
|
|
||||||
try:
|
|
||||||
response = br.follow_link(
|
|
||||||
url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)' % year, nr=0)
|
|
||||||
html = response.read()
|
|
||||||
except:
|
|
||||||
year = year - 1
|
|
||||||
try:
|
|
||||||
response = br.follow_link(
|
|
||||||
url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)' % year, nr=0)
|
|
||||||
html = response.read()
|
|
||||||
except:
|
|
||||||
response = br.open(url)
|
|
||||||
html = response.read()
|
|
||||||
|
|
||||||
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
|
|
||||||
self.temp_files[-1].write(html)
|
|
||||||
self.temp_files[-1].close()
|
|
||||||
return self.temp_files[-1].name
|
|
||||||
|
|
||||||
# #########################################################################
|
|
||||||
|
|
||||||
'''
|
|
||||||
Change Log:
|
|
||||||
Date: 10/15/2010
|
|
||||||
Feeds updated by Martin Tarenskeen
|
|
||||||
Date: 09/09/2012
|
|
||||||
Feeds updated by Eric Lammerts
|
|
||||||
'''
|
|
||||||
|
|
||||||
feeds = [
|
|
||||||
(u'Nieuws', u'http://www.volkskrant.nl/nieuws/rss.xml'),
|
|
||||||
(u'Binnenland', u'http://www.volkskrant.nl/nieuws/binnenland/rss.xml'),
|
|
||||||
(u'Buitenland', u'http://www.volkskrant.nl/buitenland/rss.xml'),
|
|
||||||
(u'Economie', u'http://www.volkskrant.nl/nieuws/economie/rss.xml'),
|
|
||||||
(u'Politiek', u'http://www.volkskrant.nl/politiek/rss.xml'),
|
|
||||||
(u'Sport', u'http://www.volkskrant.nl/sport/rss.xml'),
|
|
||||||
(u'Cultuur', u'http://www.volkskrant.nl/nieuws/cultuur/rss.xml'),
|
|
||||||
(u'Gezondheid & wetenschap',
|
|
||||||
u'http://www.volkskrant.nl/nieuws/gezondheid--wetenschap/rss.xml'),
|
|
||||||
(u'Tech & Media', u'http://www.volkskrant.nl/tech-media/rss.xml'),
|
|
||||||
(u'Reizen', u'http://www.volkskrant.nl/nieuws/reizen/rss.xml'),
|
|
||||||
(u'Opinie', u'http://www.volkskrant.nl/opinie/rss.xml'),
|
|
||||||
(u'Opmerkelijk', u'http://www.volkskrant.nl/nieuws/opmerkelijk/rss.xml')]
|
|
Loading…
x
Reference in New Issue
Block a user