This commit is contained in:
Kovid Goyal 2022-06-30 21:35:25 +05:30
commit 248fbd3192
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 179 additions and 136 deletions

View File

@@ -1,55 +1,122 @@
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
nrc.nl
'''
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import datetime
import json
from time import sleep
from mechanize import Request
from contextlib import closing
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()):
    """Create a new tag on *soup*, bridging old and new BeautifulSoup APIs."""
    # bs4 soups expose a new_tag() factory; older trees need Tag() directly.
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))
class Pagina12(BasicNewsRecipe):
class NRC(BasicNewsRecipe):
title = 'NRC'
__author__ = 'Darko Miletic'
description = 'News from Netherlands'
publisher = 'nrc.nl'
category = 'news, politics, Netherlands'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
__author__ = 'Cristi Ghera'
max_articles_per_feed = 100
description = 'NRC - Nieuws, achtergronden en onderzoeksjournalistiek'
needs_subscription = False
language = 'nl'
country = 'NL'
remove_empty_feeds = True
masthead_url = 'http://www.nrc.nl/nrc.nl/images/logo_nrc.png'
keep_only_tags = [
dict(name=['h1', 'figure']),
dict(attrs={'class': ['intro', 'byline']}),
dict(attrs={'class': lambda x: x and 'article__content' in x}),
category = 'news, politics, Netherlands'
resolve_internal_links = True
remove_tags_before = {'class':'article__header-and-content'}
remove_tags_after = {'class':'article__header-and-content'}
remove_tags = [
dict(attrs={'class':['article__footer',
'lees-ook',
'luister-naar',
'print-layout-warning',
'newslettersignup',
'article__byline',
'article__published-in',
'article__featured-image__caption__producer',
'metabox',]}),
dict(name=['script', 'noscript', 'style']),
]
remove_attributes = ['style']
remove_attributes = ["class", "id", "name", "style"]
encoding = 'utf-8'
no_stylesheets = True
ignore_duplicate_articles = {'url'}
delay = 0.3
feeds = ['http://www.nrc.nl/rss/']
touchscreen = True
frontpage = None
title_regexp = None
@staticmethod
def _monthly_list_url(date, fmt="%Y/%m/"):
    """Return the NRC archive JSON URL for *date* (monthly by default,
    daily when fmt="%Y/%m/%d/")."""
    suffix = date.strftime(fmt)
    return "https://www.nrc.nl/de/data/NH/" + suffix
def _clean_article_title(self, title):
    """Strip the <span class="keyword"> wrapper NRC puts around headline
    keywords, keeping the keyword text itself."""
    if not title:
        return title
    # Compile once and cache; reused for every headline in the issue.
    pattern = self.title_regexp
    if pattern is None:
        pattern = re.compile(r'<span class="keyword">([^<]+)</span>\s*')
        self.title_regexp = pattern
    return pattern.sub(r"\1 ", title)
def parse_index(self):
    """Build the feed structure from NRC's JSON archive endpoints.

    Returns a list of (section_name, [article dict, ...]) tuples, or []
    when no recent issue can be located.
    """
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'DNT': '1',
    }
    today = datetime.date.today()
    # Try the current month first, then the previous month, so an issue
    # is found even on the first day(s) of a month.
    last_of_prev_month = datetime.date(today.year, today.month, 1) - datetime.timedelta(days=1)
    candidate_urls = [
        self._monthly_list_url(today),
        self._monthly_list_url(last_of_prev_month),
    ]
    issue_url = None
    issue_date = None
    for candidate in candidate_urls:
        with closing(self.browser.open(Request(candidate, None, headers))) as response:
            issues = json.loads(response.read())
        if issues:
            issue_date = datetime.datetime.strptime(issues[0]["published_at"], "%Y-%m-%dT%H:%M:%SZ")
            issue_url = self._monthly_list_url(issue_date, "%Y/%m/%d/")
            # Remember the frontpage image; get_cover_url() reads it later.
            self.frontpage = issues[0]["frontpage"]
            break
    if issue_url is None:
        return []
    with closing(self.browser.open(Request(issue_url, None, headers))) as response:
        edition = json.loads(response.read())
    # Index paper headlines by document id for the section walk below.
    documents = {
        headline["document_id"]: dict(
            url=headline["item"]["full_url"],
            headline=self._clean_article_title(headline["item"]["headline"]),
        )
        for headline in edition["paperheadlines"]
    }
    sections = []
    for section in edition["sections"]:
        articles = []
        for doc in section["document_ids"]:
            if doc in documents:
                articles.append(dict(
                    title=documents[doc]["headline"],
                    url=documents[doc]["url"],
                ))
            else:
                self.log.warn('Document not found:', doc)
        sections.append((section["name"], articles))
    return sections
def preprocess_html(self, soup):
    """Inject the featured image and normalize lazy-loaded <img> URLs.

    Fix: the original read tag['src'] unconditionally after the
    data-src fallbacks, raising KeyError for an <img> that has no src
    attribute at all; that access is now guarded.
    """
    # Use the article's meta image (first match only) as the featured image.
    src = None
    for meta in soup.findAll('meta', itemprop='image', content=True):
        src = meta['content']
        break
    if src is not None:
        div = soup.find(
            'div', attrs={'class': lambda x: x and 'featured-img' in x})
        if div is not None:
            img = new_tag(soup, 'img')
            img['src'] = src
            div.append(img)
    for tag in soup():
        if tag.name == 'img':
            # Lazy-load attributes may hold "url|metadata" pairs; keep the URL.
            if tag.has_attr('data-src-medium'):
                tag['src'] = tag['data-src-medium'].split("|")[0]
            elif tag.has_attr('data-src'):
                tag['src'] = tag['data-src'].split("|")[0]
            # Guard: some images carry no src attribute (previously KeyError).
            if tag.has_attr('src'):
                if tag['src'].startswith('//'):
                    tag['src'] = 'https:' + tag['src']
                elif tag['src'].startswith('/'):
                    tag['src'] = 'https://www.nrc.nl' + tag['src']
    # NOTE(review): cookies are cleared after every article — presumably to
    # avoid metered-access tracking; confirm before removing.
    if self.browser.cookiejar:
        self.browser.cookiejar.clear()
    return soup
def get_cover_url(self):
    """Return the frontpage image URL captured earlier by parse_index()."""
    return self.frontpage

View File

@@ -1,93 +1,69 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import uuid
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Modified by Tony Stegall
on 10/10/10 to include function to grab print version of articles
'''
from datetime import date
from calibre.web.feeds.news import BasicNewsRecipe
'''
added by Tony Stegall
'''
#######################################################
from calibre.ptempfile import PersistentTemporaryFile
#######################################################
class AdvancedUserRecipe1249039563(BasicNewsRecipe):
title = u'De Volkskrant'
__author__ = 'acidzebra'
oldest_article = 7
class Volkskrant(BasicNewsRecipe):
title = 'Volkskrant'
__author__ = 'Cristi Ghera'
max_articles_per_feed = 100
description = 'Volkskrant - Nieuws, achtergronden en columns'
needs_subscription = False
resolve_internal_links = True
remove_tags_before = dict(id='main-content')
remove_tags_after = dict(id='main-content')
remove_tags = [
dict(attrs={'class':['article-footer__sharing', 'artstyle__editorial-tips', 'artstyle__advertisement','artstyle__container__icon','artstyle__disabled-embed','container__title__icon',]}),
dict(attrs={'data-element-id': ['article-element-authors']}),
dict(name=['script', 'noscript', 'style']),
]
remove_attributes = ["class", "id", "name", "style"]
encoding = 'utf-8'
no_stylesheets = True
language = 'nl'
ignore_duplicate_articles = {'url'}
extra_css = '''
body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
h1{font-size:large;}
'''
'''
Change Log:
Date: 10/10/10 - Modified code to include obfuscated to get the print version
Author: Tony Stegall
def parse_index(self):
soup = self.index_to_soup('https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4()))
containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
sections = []
for container in containers:
section_title = self.tag_to_string(container.find('h2')).strip()
articles = []
Date: 01/01/11 - Modified for better results around December/January.
Author: Martin Tarenskeen
'''
# #########################################################################
temp_files = []
articles_are_obfuscated = True
for art in container.findAll('article'):
a = art.find('a')
url = a['href']
if url[0] == '/':
url = 'https://www.volkskrant.nl' + url
if '/editie/' not in url:
continue
header = a.find('header')
teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip()
teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip()
teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip()
if teaser_label.lower() == "podcast":
continue
parts = []
if teaser_label:
parts.append(teaser_label.upper())
if teaser_sublabel:
parts.append(teaser_sublabel)
if teaser_title:
parts.append(teaser_title)
article_title = ' \u2022 '.join(parts)
pubdate = ''
description = ''
articles.append(dict(title=article_title,
url=url,
date=pubdate,
description=description,
content=''))
def get_obfuscated_article(self, url):
br = self.browser.clone_browser()
br.open(url)
year = date.today().year
sections.append((section_title, articles))
return sections
try:
response = br.follow_link(
url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)' % year, nr=0)
html = response.read()
except:
year = year - 1
try:
response = br.follow_link(
url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)' % year, nr=0)
html = response.read()
except:
response = br.open(url)
html = response.read()
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write(html)
self.temp_files[-1].close()
return self.temp_files[-1].name
# #########################################################################
'''
Change Log:
Date: 10/15/2010
Feeds updated by Martin Tarenskeen
Date: 09/09/2012
Feeds updated by Eric Lammerts
'''
feeds = [
(u'Nieuws', u'http://www.volkskrant.nl/nieuws/rss.xml'),
(u'Binnenland', u'http://www.volkskrant.nl/nieuws/binnenland/rss.xml'),
(u'Buitenland', u'http://www.volkskrant.nl/buitenland/rss.xml'),
(u'Economie', u'http://www.volkskrant.nl/nieuws/economie/rss.xml'),
(u'Politiek', u'http://www.volkskrant.nl/politiek/rss.xml'),
(u'Sport', u'http://www.volkskrant.nl/sport/rss.xml'),
(u'Cultuur', u'http://www.volkskrant.nl/nieuws/cultuur/rss.xml'),
(u'Gezondheid & wetenschap',
u'http://www.volkskrant.nl/nieuws/gezondheid--wetenschap/rss.xml'),
(u'Tech & Media', u'http://www.volkskrant.nl/tech-media/rss.xml'),
(u'Reizen', u'http://www.volkskrant.nl/nieuws/reizen/rss.xml'),
(u'Opinie', u'http://www.volkskrant.nl/opinie/rss.xml'),
(u'Opmerkelijk', u'http://www.volkskrant.nl/nieuws/opmerkelijk/rss.xml')]
def preprocess_html(self, soup):
    """Make site-relative image URLs absolute.

    Fix: the original indexed tag['src'][0] unconditionally, raising
    KeyError for an <img> without a src attribute and IndexError for an
    empty src; both cases are now skipped.
    """
    for tag in soup():
        if tag.name == 'img':
            src = tag.get('src')
            # Matches both "/path" and protocol-relative "//host/path",
            # exactly as the original startswith-'/' check did.
            if src and src[0] == '/':
                tag['src'] = 'https://www.volkskrant.nl' + src
    return soup