Update FAZ.NET

2025-07-09 03:04:10 -04:00 · 2017-09-06 07:02:28 +05:30 · 2017-09-06 07:02:28 +05:30 · ccc3be19ac
commit ccc3be19ac
parent e767a32d39
1 changed file with 60 additions and 71 deletions
--- a/recipes/faznet.recipe
+++ b/recipes/faznet.recipe
@@ -1,15 +1,19 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import unicode_literals, division, absolute_import, print_function
+from calibre.web.feeds.news import BasicNewsRecipe
 __license__   = 'GPL v3'
 __copyright__ = '2008-2011, Kovid Goyal <kovid at kovidgoyal.net>, Darko Miletic <darko at gmail.com>'
-'''
-Profile to download FAZ.NET
-'''
-
-from calibre.web.feeds.news import BasicNewsRecipe


 class FazNet(BasicNewsRecipe):
+    # Version 8.0
+    # Update 2017-09-01
+    # Armin Geller
+    # new web page layout
+
    title                 = 'FAZ.NET'
-    __author__ = 'Kovid Goyal, Darko Miletic, Armin Geller'  # AGe upd. V7 2016-01-26
+    __author__            = 'Kovid Goyal, Darko Miletic, Armin Geller'
    description           = 'Frankfurter Allgemeine Zeitung'
    publisher             = 'Frankfurter Allgemeine Zeitung GmbH'
    category              = 'news, politics, Germany'
@@ -21,20 +25,18 @@ class FazNet(BasicNewsRecipe):
    encoding              = 'utf-8'
    remove_javascript     = True

-    keep_only_tags = [
-        {'class': ['FAZArtikelEinleitung']},
-        dict(name='div', attrs={'class': 'FAZSlimHeader'}),
-        {'id': 'ArtikelTabContent_0'}
-    ]
+    keep_only_tags = [dict(name='article', attrs={'class':'atc'})]
+
+    remove_tags_after = [dict(name='article', attrs={'class':['atc']})]

-    remove_tags_after = [dict(name='div', attrs={'class': ['ArtikelFooter']})]
    remove_tags = [
-        dict(name='div', attrs={'class': ['ArtikelFooter', 'clear']}),
-        # AGe 2016-01-26
-        dict(name='div', attrs={'id': ['berndsbox', 'dertagbox']}),
-        dict(name='a', attrs={'title': ['Vergrößern']}),  # AGe 2014-10-22
-        dict(name='img', attrs={'class': ['VideoCtrlIcon']}),  # AGe 2014-10-22
-        dict(name='span', attrs={'class': ['shareAutor']})  # AGe 2014-10-22
+                    dict(name='aside', attrs={'class':['atc-ContainerMore ',
+                                                       'atc-ContainerMore atc-ContainerMoreOneTeaser sld-TeaserMoreOneTeaser  js-slider-teaser-more'
+                                                       ]}),
+                    dict(name='div', attrs={'class':['atc-ContainerSocialMedia',
+                                                     'atc-ContainerFunctions_Interaction ',
+                                                     'ctn-PlaceholderContent ctn-PlaceholderContent-is-in-article-medium '
+                                                     ]})
                  ]

    feeds = [
@@ -53,36 +55,25 @@ class FazNet(BasicNewsRecipe):
                ('Rhein-Main', 'http://www.faz.net/aktuell/rhein-main/?rssview=1')
            ]

-# AGe 2014-01-10 For multipages
+    # For multipages:
+
    INDEX = ''

    def append_page(self, soup, appendtag, position):
-        pager = soup.find('a', attrs={'title': 'Nächste Seite'})
+        pager = soup.find('li',attrs={'class':'nvg-Paginator_Item nvg-Paginator_Item-to-next-page'})
        if pager:
-            nexturl = self.INDEX + pager['href']
+            nexturl = self.INDEX + pager.a['href']
            soup2 = self.index_to_soup(nexturl)
-            texttag = soup2.find('div', attrs={'class': 'FAZArtikelContent'})
+            texttag = soup2.find('article', attrs={'class':'atc'})
            for cls in (
-                    'ArtikelFooter', 'ArtikelAbbinder',
-                    'ArtikelKommentieren Artikelfuss GETS;tk;boxen.top-lesermeinungen;tp;content',
-                    'Anzeige GoogleAdsBuehne', 'ThemenLinks', 'rechtehinweis',
-                    'stageModule Ressortmodul Rubrikenkopf clearfix', 'VideoCtrlIcon',
-                    'ArtikelAbbinder clearfix',
-                    'stageModule clearfix GETS;tk;artikel.empfehlungen.weitere-artikel;tp;content',
-                    'ThemenLinks',
-            ):  # AGe 2014-10-22
+                    'atc-Header',
+                    'ctn-PlaceholderContent ctn-PlaceholderContent-is-in-article-medium ',
+                    'ctn-PlaceholderContent ctn-PlaceholderContent-is-in-article-medium ctn-PlaceholderContent-has-centered-content ',
+                    'atc-ContainerMore '
+                    ):
                div = texttag.find(attrs={'class':cls})
                if div is not None:
                    div.extract()
-            for cls in (
-                    'berndsbox', 'dertagbox'):  # AGe 2016-01-26
-                div = texttag.find(attrs={'id': cls})
-                if div is not None:
-                    div.extract()
-                # AGe 2014-10-22
-                div = texttag.find(attrs={'title': 'Vergrößern'})
-                if div is not None:
-                    div.extract()
            newpos = len(texttag.contents)
            self.append_page(soup2,texttag,newpos)
            texttag.extract()
@@ -95,10 +86,8 @@ class FazNet(BasicNewsRecipe):
            img['src'] = img['data-src']
        return self.adeify_images(soup)

+    # Some last cleanup
    def postprocess_html(self, soup, first_fetch):
-        for div in soup.findAll(id='ArticlePagerBottom'):
-            div.extract()
-        # AGe add 2014-10-24
-        for div in soup.findAll('div', attrs={'class': 'clear'}):
+        for div in soup.findAll('div',attrs={'class':['atc-ContainerFunctions_Navigation','atc-ContainerFunctions_Interaction ']}):
            div.extract()
        return soup