Update FAZ.NET

Kovid Goyal 2017-09-06 07:02:28 +05:30
parent e767a32d39
commit ccc3be19ac
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C


@@ -1,15 +1,19 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import unicode_literals, division, absolute_import, print_function
+from calibre.web.feeds.news import BasicNewsRecipe
 __license__ = 'GPL v3'
 __copyright__ = '2008-2011, Kovid Goyal <kovid at kovidgoyal.net>, Darko Miletic <darko at gmail.com>'
 '''
 Profile to download FAZ.NET
 '''
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
 class FazNet(BasicNewsRecipe):
+    # Version 8.0
+    # Update 2017-09-01
+    # Armin Geller
+    # new web page layout
     title = 'FAZ.NET'
-    __author__ = 'Kovid Goyal, Darko Miletic, Armin Geller'  # AGe upd. V7 2016-01-26
+    __author__ = 'Kovid Goyal, Darko Miletic, Armin Geller'
     description = 'Frankfurter Allgemeine Zeitung'
     publisher = 'Frankfurter Allgemeine Zeitung GmbH'
     category = 'news, politics, Germany'
@@ -21,20 +25,18 @@ class FazNet(BasicNewsRecipe):
     encoding = 'utf-8'
     remove_javascript = True
 
-    keep_only_tags = [
-        {'class': ['FAZArtikelEinleitung']},
-        dict(name='div', attrs={'class': 'FAZSlimHeader'}),
-        {'id': 'ArtikelTabContent_0'}
-    ]
-    remove_tags_after = [dict(name='div', attrs={'class': ['ArtikelFooter']})]
+    keep_only_tags = [dict(name='article', attrs={'class': 'atc'})]
+
+    remove_tags_after = [dict(name='article', attrs={'class': ['atc']})]
 
     remove_tags = [
-        dict(name='div', attrs={'class': ['ArtikelFooter', 'clear']}),
-        # AGe 2016-01-26
-        dict(name='div', attrs={'id': ['berndsbox', 'dertagbox']}),
-        dict(name='a', attrs={'title': ['Vergrößern']}),  # AGe 2014-10-22
-        dict(name='img', attrs={'class': ['VideoCtrlIcon']}),  # AGe 2014-10-22
-        dict(name='span', attrs={'class': ['shareAutor']})  # AGe 2014-10-22
+        dict(name='aside', attrs={'class': ['atc-ContainerMore ',
+                                            'atc-ContainerMore atc-ContainerMoreOneTeaser sld-TeaserMoreOneTeaser js-slider-teaser-more'
+                                            ]}),
+        dict(name='div', attrs={'class': ['atc-ContainerSocialMedia',
+                                          'atc-ContainerFunctions_Interaction ',
+                                          'ctn-PlaceholderContent ctn-PlaceholderContent-is-in-article-medium '
+                                          ]})
     ]
 
     feeds = [
@@ -53,36 +55,25 @@ class FazNet(BasicNewsRecipe):
         ('Rhein-Main', 'http://www.faz.net/aktuell/rhein-main/?rssview=1')
     ]
 
-    # AGe 2014-01-10 For multipages
+    # For multipages:
    INDEX = ''
 
+
     def append_page(self, soup, appendtag, position):
-        pager = soup.find('a', attrs={'title': 'Nächste Seite'})
+        pager = soup.find('li', attrs={'class': 'nvg-Paginator_Item nvg-Paginator_Item-to-next-page'})
         if pager:
-            nexturl = self.INDEX + pager['href']
+            nexturl = self.INDEX + pager.a['href']
             soup2 = self.index_to_soup(nexturl)
-            texttag = soup2.find('div', attrs={'class': 'FAZArtikelContent'})
+            texttag = soup2.find('article', attrs={'class': 'atc'})
             for cls in (
-                'ArtikelFooter', 'ArtikelAbbinder',
-                'ArtikelKommentieren Artikelfuss GETS;tk;boxen.top-lesermeinungen;tp;content',
-                'Anzeige GoogleAdsBuehne', 'ThemenLinks', 'rechtehinweis',
-                'stageModule Ressortmodul Rubrikenkopf clearfix', 'VideoCtrlIcon',
-                'ArtikelAbbinder clearfix',
-                'stageModule clearfix GETS;tk;artikel.empfehlungen.weitere-artikel;tp;content',
-                'ThemenLinks',
-            ):  # AGe 2014-10-22
+                'atc-Header',
+                'ctn-PlaceholderContent ctn-PlaceholderContent-is-in-article-medium ',
+                'ctn-PlaceholderContent ctn-PlaceholderContent-is-in-article-medium ctn-PlaceholderContent-has-centered-content ',
+                'atc-ContainerMore '
+            ):
                 div = texttag.find(attrs={'class': cls})
                 if div is not None:
                     div.extract()
-            for cls in (
-                    'berndsbox', 'dertagbox'):  # AGe 2016-01-26
-                div = texttag.find(attrs={'id': cls})
-                if div is not None:
-                    div.extract()
-            # AGe 2014-10-22
-            div = texttag.find(attrs={'title': 'Vergrößern'})
-            if div is not None:
-                div.extract()
             newpos = len(texttag.contents)
             self.append_page(soup2, texttag, newpos)
             texttag.extract()
@@ -95,10 +86,8 @@ class FazNet(BasicNewsRecipe):
                 img['src'] = img['data-src']
         return self.adeify_images(soup)
 
+    # Some last cleanup
     def postprocess_html(self, soup, first_fetch):
-        for div in soup.findAll(id='ArticlePagerBottom'):
-            div.extract()
-        # AGe add 2014-10-24
-        for div in soup.findAll('div', attrs={'class': 'clear'}):
+        for div in soup.findAll('div', attrs={'class': ['atc-ContainerFunctions_Navigation', 'atc-ContainerFunctions_Interaction ']}):
             div.extract()
         return soup
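
For reference, the multipage handling above can be read in isolation. Below is a minimal standalone sketch of the same follow-the-pager pattern, under stated assumptions: the sample markup, the FAKE_PAGES mapping (standing in for the recipe's self.index_to_soup() network fetch), and the pared-down cleanup are invented for illustration, and it uses the standalone bs4 package rather than calibre's bundled BeautifulSoup. The live FAZ class names in the recipe include trailing spaces ('atc-ContainerMore ', 'atc-ContainerFunctions_Interaction '); the sketch simplifies those away.

# Standalone sketch -- not part of the commit.
from bs4 import BeautifulSoup

PAGE_TWO = """
<article class="atc">
  <div class="atc-Header">Repeated headline</div>
  <p>Second page of the article text.</p>
</article>
"""

PAGE_ONE = """
<article class="atc">
  <p>First page of the article text.</p>
  <ul>
    <li class="nvg-Paginator_Item nvg-Paginator_Item-to-next-page">
      <a href="/page-2">2</a>
    </li>
  </ul>
</article>
"""

FAKE_PAGES = {'/page-2': PAGE_TWO}  # stands in for self.index_to_soup()

def append_page(soup, appendtag):
    # Find the 'next page' pager item, as the updated recipe does.
    pager = soup.find('li', attrs={'class': 'nvg-Paginator_Item nvg-Paginator_Item-to-next-page'})
    if pager is None:
        return
    soup2 = BeautifulSoup(FAKE_PAGES[pager.a['href']], 'html.parser')
    texttag = soup2.find('article', attrs={'class': 'atc'})
    # Strip chrome that repeats on every continuation page.
    tag = texttag.find(attrs={'class': 'atc-Header'})
    if tag is not None:
        tag.extract()
    append_page(soup2, texttag)  # recurse in case a third page follows
    pager.extract()              # drop the pager from the assembled article
    for el in list(texttag.contents):
        appendtag.append(el)     # splice the continuation into page one

soup = BeautifulSoup(PAGE_ONE, 'html.parser')
article = soup.find('article', attrs={'class': 'atc'})
append_page(soup, article)
print(article.get_text(' ', strip=True))
# -> First page of the article text. Second page of the article text.

A change like this is easiest to verify with calibre's recipe test mode, e.g. ebook-convert faz.recipe .epub --test -vv (faz.recipe being a local copy of the updated recipe), which fetches only a couple of articles per feed.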