diff --git a/recipes/nrc.nl.recipe b/recipes/nrc.nl.recipe index cab485e846..5954092a61 100644 --- a/recipes/nrc.nl.recipe +++ b/recipes/nrc.nl.recipe @@ -2,11 +2,11 @@ from calibre.web.feeds.recipes import BasicNewsRecipe import datetime import json -from time import sleep from mechanize import Request from contextlib import closing import re + class NRC(BasicNewsRecipe): title = 'NRC' __author__ = 'Cristi Ghera' @@ -17,18 +17,24 @@ class NRC(BasicNewsRecipe): country = 'NL' category = 'news, politics, Netherlands' resolve_internal_links = True - remove_tags_before = {'class':'article__header-and-content'} - remove_tags_after = {'class':'article__header-and-content'} + remove_tags_before = {'class': 'article__header-and-content'} + remove_tags_after = {'class': 'article__header-and-content'} remove_tags = [ - dict(attrs={'class':['article__footer', - 'lees-ook', - 'luister-naar', - 'print-layout-warning', - 'newslettersignup', - 'article__byline', - 'article__published-in', - 'article__featured-image__caption__producer', - 'metabox',]}), + dict( + attrs={ + 'class': [ + 'article__footer', + 'lees-ook', + 'luister-naar', + 'print-layout-warning', + 'newslettersignup', + 'article__byline', + 'article__published-in', + 'article__featured-image__caption__producer', + 'metabox', + ] + } + ), dict(name=['script', 'noscript', 'style']), ] remove_attributes = ["class", "id", "name", "style"] @@ -36,24 +42,26 @@ class NRC(BasicNewsRecipe): no_stylesheets = True ignore_duplicate_articles = {'url'} delay = 0.3 - + touchscreen = True - + frontpage = None - + title_regexp = None - + @staticmethod def _monthly_list_url(date, fmt="%Y/%m/"): return "https://www.nrc.nl/de/data/NH/" + date.strftime(fmt) - + def _clean_article_title(self, title): if not title: return title if self.title_regexp is None: - self.title_regexp = re.compile(r'([^<]+)\s*') + self.title_regexp = re.compile( + r'([^<]+)\s*' + ) return self.title_regexp.sub(r"\1 ", title) - + def parse_index(self): sections = [] today = datetime.date.today() @@ -64,15 +72,22 @@ class NRC(BasicNewsRecipe): } monthly_list_urls = [ self._monthly_list_url(today), - self._monthly_list_url(datetime.date(today.year, today.month, 1) - datetime.timedelta(days=1)) + self._monthly_list_url( + datetime.date(today.year, today.month, 1) - + datetime.timedelta(days=1) + ) ] issue_url = None issue_date = None for monthly_list_url in monthly_list_urls: - with closing(self.browser.open(Request(monthly_list_url, None, headers))) as r: + with closing( + self.browser.open(Request(monthly_list_url, None, headers)) + ) as r: issues = json.loads(r.read()) if len(issues) > 0: - issue_date = datetime.datetime.strptime(issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ") + issue_date = datetime.datetime.strptime( + issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ" + ) issue_url = self._monthly_list_url(issue_date, "%Y/%m/%d/") self.frontpage = issues[0]["frontpage"] break @@ -93,14 +108,12 @@ class NRC(BasicNewsRecipe): if doc not in documents: self.log.warn('Document not found:', doc) continue - articles.append(dict( - title=documents[doc]["headline"], - url=documents[doc]["url"] - )) - sections.append(( - section["name"], - articles - )) + articles.append( + dict( + title=documents[doc]["headline"], url=documents[doc]["url"] + ) + ) + sections.append((section["name"], articles)) return sections def preprocess_html(self, soup): @@ -119,4 +132,4 @@ class NRC(BasicNewsRecipe): return soup def get_cover_url(self): - return self.frontpage \ No newline at end of file + return self.frontpage diff --git a/recipes/volksrant.recipe b/recipes/volksrant.recipe index 90338ec2fb..18741ee0a4 100644 --- a/recipes/volksrant.recipe +++ b/recipes/volksrant.recipe @@ -2,6 +2,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe import uuid + class Volkskrant(BasicNewsRecipe): title = 'Volkskrant' __author__ = 'Cristi Ghera' @@ -10,9 +11,20 @@ class Volkskrant(BasicNewsRecipe): needs_subscription = False resolve_internal_links = True remove_tags_before = dict(id='main-content') - remove_tags_after = dict(id='main-content') + remove_tags_after = dict(id='main-content') remove_tags = [ - dict(attrs={'class':['article-footer__sharing', 'artstyle__editorial-tips', 'artstyle__advertisement','artstyle__container__icon','artstyle__disabled-embed','container__title__icon',]}), + dict( + attrs={ + 'class': [ + 'article-footer__sharing', + 'artstyle__editorial-tips', + 'artstyle__advertisement', + 'artstyle__container__icon', + 'artstyle__disabled-embed', + 'container__title__icon', + ] + } + ), dict(attrs={'data-element-id': ['article-element-authors']}), dict(name=['script', 'noscript', 'style']), ] @@ -20,15 +32,17 @@ class Volkskrant(BasicNewsRecipe): encoding = 'utf-8' no_stylesheets = True ignore_duplicate_articles = {'url'} - + def parse_index(self): - soup = self.index_to_soup('https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4())) + soup = self.index_to_soup( + 'https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4()) + ) containers = soup.findAll('section', attrs={'class': 'section--horizontal'}) sections = [] for container in containers: section_title = self.tag_to_string(container.find('h2')).strip() articles = [] - + for art in container.findAll('article'): a = art.find('a') url = a['href'] @@ -37,9 +51,18 @@ class Volkskrant(BasicNewsRecipe): if '/editie/' not in url: continue header = a.find('header') - teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip() - teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip() - teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip() + teaser_label = self.tag_to_string( + header.find('h4').find('span', attrs={'class': 'teaser__label'}) + ).strip() + teaser_sublabel = self.tag_to_string( + header.find('h4' + ).find('span', attrs={'class': 'teaser__sublabel'}) + ).strip() + teaser_title = self.tag_to_string( + header.find('h3').find( + 'span', attrs={'class': 'teaser__title__value--short'} + ) + ).strip() if teaser_label.lower() == "podcast": continue parts = [] @@ -52,12 +75,16 @@ class Volkskrant(BasicNewsRecipe): article_title = ' \u2022 '.join(parts) pubdate = '' description = '' - articles.append(dict(title=article_title, - url=url, - date=pubdate, - description=description, - content='')) - + articles.append( + dict( + title=article_title, + url=url, + date=pubdate, + description=description, + content='' + ) + ) + sections.append((section_title, articles)) return sections @@ -66,4 +93,4 @@ class Volkskrant(BasicNewsRecipe): if tag.name == 'img': if tag['src'][0] == '/': tag['src'] = 'https://www.volkskrant.nl' + tag['src'] - return soup \ No newline at end of file + return soup