mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
pep8
This commit is contained in:
parent
248fbd3192
commit
65c55a6d44
@ -2,11 +2,11 @@
|
|||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
from time import sleep
|
|
||||||
from mechanize import Request
|
from mechanize import Request
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
class NRC(BasicNewsRecipe):
|
class NRC(BasicNewsRecipe):
|
||||||
title = 'NRC'
|
title = 'NRC'
|
||||||
__author__ = 'Cristi Ghera'
|
__author__ = 'Cristi Ghera'
|
||||||
@ -17,18 +17,24 @@ class NRC(BasicNewsRecipe):
|
|||||||
country = 'NL'
|
country = 'NL'
|
||||||
category = 'news, politics, Netherlands'
|
category = 'news, politics, Netherlands'
|
||||||
resolve_internal_links = True
|
resolve_internal_links = True
|
||||||
remove_tags_before = {'class':'article__header-and-content'}
|
remove_tags_before = {'class': 'article__header-and-content'}
|
||||||
remove_tags_after = {'class':'article__header-and-content'}
|
remove_tags_after = {'class': 'article__header-and-content'}
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(attrs={'class':['article__footer',
|
dict(
|
||||||
'lees-ook',
|
attrs={
|
||||||
'luister-naar',
|
'class': [
|
||||||
'print-layout-warning',
|
'article__footer',
|
||||||
'newslettersignup',
|
'lees-ook',
|
||||||
'article__byline',
|
'luister-naar',
|
||||||
'article__published-in',
|
'print-layout-warning',
|
||||||
'article__featured-image__caption__producer',
|
'newslettersignup',
|
||||||
'metabox',]}),
|
'article__byline',
|
||||||
|
'article__published-in',
|
||||||
|
'article__featured-image__caption__producer',
|
||||||
|
'metabox',
|
||||||
|
]
|
||||||
|
}
|
||||||
|
),
|
||||||
dict(name=['script', 'noscript', 'style']),
|
dict(name=['script', 'noscript', 'style']),
|
||||||
]
|
]
|
||||||
remove_attributes = ["class", "id", "name", "style"]
|
remove_attributes = ["class", "id", "name", "style"]
|
||||||
@ -36,24 +42,26 @@ class NRC(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
ignore_duplicate_articles = {'url'}
|
ignore_duplicate_articles = {'url'}
|
||||||
delay = 0.3
|
delay = 0.3
|
||||||
|
|
||||||
touchscreen = True
|
touchscreen = True
|
||||||
|
|
||||||
frontpage = None
|
frontpage = None
|
||||||
|
|
||||||
title_regexp = None
|
title_regexp = None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _monthly_list_url(date, fmt="%Y/%m/"):
|
def _monthly_list_url(date, fmt="%Y/%m/"):
|
||||||
return "https://www.nrc.nl/de/data/NH/" + date.strftime(fmt)
|
return "https://www.nrc.nl/de/data/NH/" + date.strftime(fmt)
|
||||||
|
|
||||||
def _clean_article_title(self, title):
|
def _clean_article_title(self, title):
|
||||||
if not title:
|
if not title:
|
||||||
return title
|
return title
|
||||||
if self.title_regexp is None:
|
if self.title_regexp is None:
|
||||||
self.title_regexp = re.compile(r'<span class="keyword">([^<]+)</span>\s*')
|
self.title_regexp = re.compile(
|
||||||
|
r'<span class="keyword">([^<]+)</span>\s*'
|
||||||
|
)
|
||||||
return self.title_regexp.sub(r"\1 ", title)
|
return self.title_regexp.sub(r"\1 ", title)
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
sections = []
|
sections = []
|
||||||
today = datetime.date.today()
|
today = datetime.date.today()
|
||||||
@ -64,15 +72,22 @@ class NRC(BasicNewsRecipe):
|
|||||||
}
|
}
|
||||||
monthly_list_urls = [
|
monthly_list_urls = [
|
||||||
self._monthly_list_url(today),
|
self._monthly_list_url(today),
|
||||||
self._monthly_list_url(datetime.date(today.year, today.month, 1) - datetime.timedelta(days=1))
|
self._monthly_list_url(
|
||||||
|
datetime.date(today.year, today.month, 1) -
|
||||||
|
datetime.timedelta(days=1)
|
||||||
|
)
|
||||||
]
|
]
|
||||||
issue_url = None
|
issue_url = None
|
||||||
issue_date = None
|
issue_date = None
|
||||||
for monthly_list_url in monthly_list_urls:
|
for monthly_list_url in monthly_list_urls:
|
||||||
with closing(self.browser.open(Request(monthly_list_url, None, headers))) as r:
|
with closing(
|
||||||
|
self.browser.open(Request(monthly_list_url, None, headers))
|
||||||
|
) as r:
|
||||||
issues = json.loads(r.read())
|
issues = json.loads(r.read())
|
||||||
if len(issues) > 0:
|
if len(issues) > 0:
|
||||||
issue_date = datetime.datetime.strptime(issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ")
|
issue_date = datetime.datetime.strptime(
|
||||||
|
issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ"
|
||||||
|
)
|
||||||
issue_url = self._monthly_list_url(issue_date, "%Y/%m/%d/")
|
issue_url = self._monthly_list_url(issue_date, "%Y/%m/%d/")
|
||||||
self.frontpage = issues[0]["frontpage"]
|
self.frontpage = issues[0]["frontpage"]
|
||||||
break
|
break
|
||||||
@ -93,14 +108,12 @@ class NRC(BasicNewsRecipe):
|
|||||||
if doc not in documents:
|
if doc not in documents:
|
||||||
self.log.warn('Document not found:', doc)
|
self.log.warn('Document not found:', doc)
|
||||||
continue
|
continue
|
||||||
articles.append(dict(
|
articles.append(
|
||||||
title=documents[doc]["headline"],
|
dict(
|
||||||
url=documents[doc]["url"]
|
title=documents[doc]["headline"], url=documents[doc]["url"]
|
||||||
))
|
)
|
||||||
sections.append((
|
)
|
||||||
section["name"],
|
sections.append((section["name"], articles))
|
||||||
articles
|
|
||||||
))
|
|
||||||
return sections
|
return sections
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
@ -119,4 +132,4 @@ class NRC(BasicNewsRecipe):
|
|||||||
return soup
|
return soup
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
return self.frontpage
|
return self.frontpage
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
|
|
||||||
class Volkskrant(BasicNewsRecipe):
|
class Volkskrant(BasicNewsRecipe):
|
||||||
title = 'Volkskrant'
|
title = 'Volkskrant'
|
||||||
__author__ = 'Cristi Ghera'
|
__author__ = 'Cristi Ghera'
|
||||||
@ -10,9 +11,20 @@ class Volkskrant(BasicNewsRecipe):
|
|||||||
needs_subscription = False
|
needs_subscription = False
|
||||||
resolve_internal_links = True
|
resolve_internal_links = True
|
||||||
remove_tags_before = dict(id='main-content')
|
remove_tags_before = dict(id='main-content')
|
||||||
remove_tags_after = dict(id='main-content')
|
remove_tags_after = dict(id='main-content')
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(attrs={'class':['article-footer__sharing', 'artstyle__editorial-tips', 'artstyle__advertisement','artstyle__container__icon','artstyle__disabled-embed','container__title__icon',]}),
|
dict(
|
||||||
|
attrs={
|
||||||
|
'class': [
|
||||||
|
'article-footer__sharing',
|
||||||
|
'artstyle__editorial-tips',
|
||||||
|
'artstyle__advertisement',
|
||||||
|
'artstyle__container__icon',
|
||||||
|
'artstyle__disabled-embed',
|
||||||
|
'container__title__icon',
|
||||||
|
]
|
||||||
|
}
|
||||||
|
),
|
||||||
dict(attrs={'data-element-id': ['article-element-authors']}),
|
dict(attrs={'data-element-id': ['article-element-authors']}),
|
||||||
dict(name=['script', 'noscript', 'style']),
|
dict(name=['script', 'noscript', 'style']),
|
||||||
]
|
]
|
||||||
@ -20,15 +32,17 @@ class Volkskrant(BasicNewsRecipe):
|
|||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
ignore_duplicate_articles = {'url'}
|
ignore_duplicate_articles = {'url'}
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
soup = self.index_to_soup('https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4()))
|
soup = self.index_to_soup(
|
||||||
|
'https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4())
|
||||||
|
)
|
||||||
containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
|
containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
|
||||||
sections = []
|
sections = []
|
||||||
for container in containers:
|
for container in containers:
|
||||||
section_title = self.tag_to_string(container.find('h2')).strip()
|
section_title = self.tag_to_string(container.find('h2')).strip()
|
||||||
articles = []
|
articles = []
|
||||||
|
|
||||||
for art in container.findAll('article'):
|
for art in container.findAll('article'):
|
||||||
a = art.find('a')
|
a = art.find('a')
|
||||||
url = a['href']
|
url = a['href']
|
||||||
@ -37,9 +51,18 @@ class Volkskrant(BasicNewsRecipe):
|
|||||||
if '/editie/' not in url:
|
if '/editie/' not in url:
|
||||||
continue
|
continue
|
||||||
header = a.find('header')
|
header = a.find('header')
|
||||||
teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip()
|
teaser_label = self.tag_to_string(
|
||||||
teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip()
|
header.find('h4').find('span', attrs={'class': 'teaser__label'})
|
||||||
teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip()
|
).strip()
|
||||||
|
teaser_sublabel = self.tag_to_string(
|
||||||
|
header.find('h4'
|
||||||
|
).find('span', attrs={'class': 'teaser__sublabel'})
|
||||||
|
).strip()
|
||||||
|
teaser_title = self.tag_to_string(
|
||||||
|
header.find('h3').find(
|
||||||
|
'span', attrs={'class': 'teaser__title__value--short'}
|
||||||
|
)
|
||||||
|
).strip()
|
||||||
if teaser_label.lower() == "podcast":
|
if teaser_label.lower() == "podcast":
|
||||||
continue
|
continue
|
||||||
parts = []
|
parts = []
|
||||||
@ -52,12 +75,16 @@ class Volkskrant(BasicNewsRecipe):
|
|||||||
article_title = ' \u2022 '.join(parts)
|
article_title = ' \u2022 '.join(parts)
|
||||||
pubdate = ''
|
pubdate = ''
|
||||||
description = ''
|
description = ''
|
||||||
articles.append(dict(title=article_title,
|
articles.append(
|
||||||
url=url,
|
dict(
|
||||||
date=pubdate,
|
title=article_title,
|
||||||
description=description,
|
url=url,
|
||||||
content=''))
|
date=pubdate,
|
||||||
|
description=description,
|
||||||
|
content=''
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
sections.append((section_title, articles))
|
sections.append((section_title, articles))
|
||||||
return sections
|
return sections
|
||||||
|
|
||||||
@ -66,4 +93,4 @@ class Volkskrant(BasicNewsRecipe):
|
|||||||
if tag.name == 'img':
|
if tag.name == 'img':
|
||||||
if tag['src'][0] == '/':
|
if tag['src'][0] == '/':
|
||||||
tag['src'] = 'https://www.volkskrant.nl' + tag['src']
|
tag['src'] = 'https://www.volkskrant.nl' + tag['src']
|
||||||
return soup
|
return soup
|
||||||
|
Loading…
x
Reference in New Issue
Block a user