Faz.net by Anonymous

2025-12-22 04:47:20 -05:00 · 2025-01-12 20:58:14 +05:30 · 2025-01-12 20:58:14 +05:30 · fd20bf9baa
commit fd20bf9baa
parent 5e8faec6eb
2 changed files with 276 additions and 1 deletions
--- a/recipes/faz_net.recipe
+++ b/recipes/faz_net.recipe
@ -0,0 +1,275 @@
+import json
+import re
+
+from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
+
+
+def format_tickaroo_liveblog(soup):
+    """Tidy the markup of an embedded Tickaroo live blog in place.
+
+    Three things are done to the elements selected by the ``tik4-*``
+    classes and to ``time`` tags:
+
+    * every ``tik4-media-image__img`` image that has a ``srcset`` gets a
+      ``src``: the URL in front of the ``960w`` descriptor when there is
+      one, otherwise (only if the image has no ``src`` yet) the first
+      ``srcset`` token;
+    * a ``<br>`` is inserted before each position-2 rich-text content block
+      and before each ``<time>`` tag;
+    * inside each author wrapper the author name is relabelled with the
+      ``tik4-media-body__title`` class and moved right behind the author
+      thumbnail.
+
+    :param soup: parsed document (BeautifulSoup-style object), modified in
+        place.
+    :return: None
+    """
+    for img in soup.findAll('img', attrs={'class': 'tik4-media-image__img', 'srcset': True}):
+        sources = img['srcset'].split()
+        # Assumes the usual "<url> <width>w, <url> <width>w" layout, so the
+        # URL of the 960w candidate is the token right before the descriptor.
+        for pos, token in enumerate(sources):
+            if pos > 0 and token in ('960w,', '960w'):
+                img['src'] = sources[pos - 1]
+                break
+        # An empty srcset yields no tokens; leave such an image untouched
+        # instead of failing on sources[0].
+        if sources and not img.has_attr('src'):
+            img['src'] = sources[0]
+
+    rich_text_class = 'tik4-content-block tik4-content-block--rich-text tik4-content-block--position-2'
+    for div in soup.findAll('div', attrs={'class': rich_text_class}):
+        div.insert_before(soup.new_tag('br'))
+
+    # format liveblogs
+    for time_tag in soup.findAll('time'):
+        time_tag.insert_before(soup.new_tag('br'))
+
+    for wrapper in soup.findAll(class_='tik4-author__wrapper'):
+        # Both lookups are scoped to this wrapper, so a wrapper without a
+        # name can never reuse the name found in a previous wrapper.
+        name = wrapper.find(class_='tik4-author__name')
+        thumb = wrapper.find(class_='tik4-author__thumb')
+        if name is None:
+            continue
+        name['class'] = 'tik4-media-body__title'
+        # Only detach the name when there is a thumbnail to put it behind;
+        # otherwise it stays where it is rather than being dropped.
+        if thumb is not None:
+            thumb.insert_after(name.extract())
+
+
+def _nuxt_text(struct, ref):
+    """Return the string stored in *struct* at index *ref*, or None.
+
+    None is returned when *ref* cannot be converted to an int, is outside
+    the bounds of *struct*, or points at something that is not a string.
+    """
+    try:
+        idx = int(ref)
+    except (TypeError, ValueError):
+        return None
+    if not 0 <= idx < len(struct):
+        return None
+    value = struct[idx]
+    return value if isinstance(value, str) else None
+
+
+def bilderstrecke(soup, tag):
+    """Process a run of images: rebuild an image gallery from JSON data.
+
+    The first child of *tag* is parsed as JSON. If the result is a list
+    that contains at least one dict with a ``caption`` key, a ``<div>`` of
+    ``<figure>`` elements is built from every dict that has both ``caption``
+    and ``defaultUrl``. The values of ``caption``, ``source`` and
+    ``defaultUrl`` are used as indices into the same list to find the
+    caption text, the image credit and the image URL.
+
+    The gallery is inserted after the ``header-teaser`` element (when there
+    are several, the first one is removed beforehand) and all
+    ``header-teaser__image--default`` elements are then removed. Without a
+    ``header-teaser`` element the gallery is not inserted and the default
+    header images are kept.
+
+    :param soup: parsed document (BeautifulSoup-style object), modified in
+        place.
+    :param tag: element whose first child holds the JSON text.
+    :return: None
+    """
+    try:
+        struct = json.loads(str(tag.contents[0]))
+    except Exception:
+        # Best effort: without parseable data there is no gallery to build.
+        return
+
+    if not isinstance(struct, list):
+        return
+    if not any(isinstance(v, dict) and 'caption' in v for v in struct):
+        return
+
+    teasers = soup.findAll(class_='header-teaser')
+    if len(teasers) > 1:
+        teasers[0].extract()
+    # Looked up again after the removal above, so the anchor is whatever
+    # header teaser is still part of the document.
+    anchor = soup.find(class_='header-teaser')
+    if anchor is None:
+        return
+
+    collect = soup.new_tag('div')
+    for v in struct:
+        if not (isinstance(v, dict) and 'caption' in v and 'defaultUrl' in v):
+            continue
+        url = _nuxt_text(struct, v['defaultUrl'])
+        if url is None:
+            # No usable image URL: skip this entry instead of failing.
+            continue
+
+        cap = soup.new_tag('p')
+        cap['class'] = "body-elements__image-figcaption"
+        caption_text = _nuxt_text(struct, v['caption'])
+        if caption_text is not None:
+            cap.append(caption_text)
+
+        if 'source' in v:
+            credit_text = _nuxt_text(struct, v['source'])
+            if credit_text is not None:
+                cred = soup.new_tag('span')
+                cred['class'] = "body-elements__image-figcaption--source"
+                cred.append(credit_text)
+                cap.append(cred)
+
+        fig = soup.new_tag("figure")
+        img = soup.new_tag('img')
+        img['src'] = url
+        fig.append(img)
+        fig.append(cap)
+        collect.append(fig)
+
+    anchor.insert_after(collect)
+
+    for default_image in soup.findAll(class_='header-teaser__image--default'):
+        default_image.extract()
+
+
+def story(soup, tag):
+    """Move the first caption of a story-type article behind its first image.
+
+    The first ``<img loading="lazy">`` and the first
+    ``<figcaption class="caption">`` of the document are looked up; when
+    both exist the caption is detached and re-inserted directly after the
+    image. Otherwise the document is left as it is.
+
+    :param soup: parsed document (BeautifulSoup-style object), modified in
+        place.
+    :param tag: not used by this function.
+    :return: None
+    """
+    lead_image = soup.find('img', attrs={'loading': 'lazy'})
+    if not lead_image:
+        return
+    lead_caption = soup.find('figcaption', attrs={'class': 'caption'})
+    if not lead_caption:
+        return
+    lead_image.insert_after(lead_caption.extract())
+
+
+class FazNet(BasicNewsRecipe):
+    """News recipe for FAZ.NET (Frankfurter Allgemeine Zeitung).
+
+    Articles come from the RSS feeds listed in ``feeds``.
+    ``preprocess_html`` reshapes story pages, image galleries and Tickaroo
+    live blogs, skips articles behind the paywall and strips images down to
+    their basic attributes; ``postprocess_html`` finishes the image
+    captions.
+
+    Setting ``test_article`` to a true value replaces the feeds with a
+    ``parse_index`` that returns one hard-coded article.
+    """
+    # Version 9.1m
+    # Update 2024-05
+    # original by Armin Geller
+    # overhaul to deal with changes in the faz.net websites
+
+    title = 'FAZ.NET'
+    __author__ = 'Unknown'
+    description = 'Frankfurter Allgemeine Zeitung'
+    publisher = 'Frankfurter Allgemeine Zeitung GmbH'
+    category = 'news, politics, Germany'
+    cover_url = 'https://upload.wikimedia.org/wikipedia/commons/7/72/Frankfurter_Allgemeine_logo.svg'
+    encoding = 'utf-8'
+    language = 'de'
+    ignore_duplicate_articles = {'title', 'url'}
+    max_articles_per_feed = 30
+    no_stylesheets = True
+    remove_javascript = True
+    # NOTE(review): presumably a (width, height) limit for images; the
+    # values look unusually small - confirm they are intended.
+    scale_news_images = (10, 100)
+    delay = 1
+
+    extra_css = '''
+        .header-title,.scrolly-title {font-size: 1.5em; font-weight:bold; text-align:left;}
+        .quote {font-size: 1.5em; font-weight:bold; text-align:center;}
+        .author {font-size: 0.7em; font-weight:bold; text-align:center; display:block;
+            margin-bottom: 0.95em; color:grey;}
+        .header-label__content {font-size: 0.7em; font-weight:bold; text-align:left; display:block;
+            margin-bottom: 0.95em; color:grey;}
+        h3 {font-size:1.3em;text-align:left;}
+        .caption,.body-elements__image-figcaption,.header-teaser__image-details,.tik4-media-body__title,.scrolly-text {
+                margin-top:0.05em;margin-bottom:1em; font-size: 0.85em; text-align:left;}
+        .body-elements__image-figcaption--source,.header-teaser__image-details--source,.tik4-media-body__credit {
+                font-size: 0.65em; font-style:italic; text-align:left;margin-left:0.4em;}
+        .header-detail--bold {font-size:0.6em; font-weight:bold; margin-bottom:0.75em;text-align:left;}
+        time {font-size:0.6em; font-weight: normal; margin-bottom:0.75em; text-align:left; display:block;}
+        .header-teaser,.scrolly-intro {font-size:1em; font-style:italic; font-weight:bold;margin-bottom:1em;}
+        .tik4-media-image {margin-bottom:1em;margin-top:1em;}
+        '''
+
+    keep_only_tags = [
+        dict(name='article', attrs={'class': ['article', 'storytelling']}),
+        dict(name='body'),
+        dict(name='div', attrs={'class': ['imageGallery', 'image_only']}),
+        dict(name='div', attrs={'class': 'tik4-live__container'}),
+        # Kept because preprocess_html reads the gallery data from it.
+        dict(name='script', attrs={'id': '__NUXT_DATA__'}),
+    ]
+
+    # 'script' is not listed here: preprocess_html still needs the
+    # __NUXT_DATA__ script when it runs.
+    remove_tags = [
+        dict(name='div', attrs={'class': [
+            'related-articles', 'consent-placeholder',
+            'article-footer content-container',
+            'tik4-sharing', 'tik4-load-more-bottom',
+            'tik4-by', 'header-detail__image', 'mm-adbox', 'upper-toolbar content-container'
+        ]}),
+        dict(name="style"),
+        dict(name='svg'),
+        dict(name='div', attrs={'data-module': 'teaser'}),
+    ]
+
+    remove_attributes = ['onclick']
+
+    # Development switch: a true value replaces the feeds below with a
+    # parse_index that returns a single hard-coded article.
+    test_article = False
+    if not test_article:
+        feeds = [
+            ('FAZ.NET Aktuell', 'https://www.faz.net/rss/aktuell/'),
+            ('Politik', 'https://www.faz.net/rss/aktuell/politik/'),
+            ('Wirtschaft', 'https://www.faz.net/rss/aktuell/wirtschaft/'),
+            ('Feuilleton', 'https://www.faz.net/rss/aktuell/feuilleton/'),
+            ('Sport', 'https://www.faz.net/rss/aktuell/sport/'),
+            ('Lebensstil', 'https://www.faz.net/rss/aktuell/lebensstil/'),
+            ('Gesellschaft', 'https://www.faz.net/rss/aktuell/gesellschaft/'),
+            ('Finanzen', 'https://www.faz.net/rss/aktuell/finanzen/'),
+            ('Technik & Motor', 'https://www.faz.net/rss/aktuell/technik-motor/'),
+            ('Wissen', 'https://www.faz.net/rss/aktuell/wissen/'),
+            ('Reise', 'https://www.faz.net/rss/aktuell/reise/'),
+            ('Karriere & Hochschule', 'https://www.faz.net/rss/aktuell/karriere-hochschule/'),
+            ('Rhein-Main', 'https://www.faz.net/rss/aktuell/rhein-main/')
+        ]
+    else:
+        def parse_index(self):
+            """Return a single-section index holding one test article.
+
+            :return: ``[('Articles', [{'title': 'Test article', 'url': ...}])]``
+            """
+            # Other pages that were used for testing:
+            #   https://www.faz.net/aktuell/feuilleton/buecher/film-eruption-ein-thriller-aus-dem-nachlass-von-michael-crichton-19770491.html
+            #   https://www.faz.net/aktuell/stil/mode-design/leonie-benesch-sandra-hueller-ist-eine-meiner-heldinnen-19671638.html
+            #   https://www.faz.net/aktuell/feuilleton/medien/sabine-postel-zum-siebzigsten-die-briten-nannten-sie-german-traktor-19708409.html
+            #   https://www.faz.net/aktuell/stil/mode-design/von-richert-beil-bis-william-fan-wer-kauft-denn-das-19666592.html
+            #   https://www.faz.net/aktuell/feuilleton/buecher/rezensionen/sachbuch/tom-mustills-buch-die-sprache-der-wale-19657782.html
+            test_article = 'https://www.faz.net/aktuell/stil/mode-im-em-jahr-wir-zeigen-wie-fussball-und-mode-zusammengehoeren-19766969.html'
+            return [('Articles', [{'title': 'Test article', 'url': test_article}])]
+
+    def preprocess_html(self, soup):
+        """Reshape a downloaded article before conversion.
+
+        Steps, in order: format story-type articles, rebuild image
+        galleries from the ``__NUXT_DATA__`` script, unwrap buttons, drop
+        the ``sr-only`` element of the header label, abort paywalled
+        articles, remove F.A.Z. newsletter paragraphs, format live blogs
+        and replace every image that has a ``src`` by a bare copy.
+
+        :param soup: parsed article (BeautifulSoup-style object), modified
+            in place.
+        :return: the same *soup*.
+        """
+        # Format story-type article
+        tag = soup.find(class_='storyContainer')
+        if tag:
+            story(soup, tag)
+
+        # Extract images and text from image galleries. A gallery is
+        # recognised by a paragraph whose only content is "<n> Bilder";
+        # only the first such paragraph and the first data script are used.
+        for par in soup.findAll('p'):
+            if len(par.contents) != 1:
+                continue
+            if re.search(r"^[1-9]\d* Bilder$", str(par.contents[0])):
+                data = soup.find('script', attrs={'id': "__NUXT_DATA__", 'type': 'application/json'})
+                if data is not None:
+                    bilderstrecke(soup, data)
+                break
+
+        # unwrap buttons
+        for tag in soup.findAll('button'):
+            tag.unwrap()
+
+        # remove ":"
+        tag = soup.find(class_="header-label__content")
+        if tag:
+            colon = tag.find(class_="sr-only")
+            if colon:
+                colon.extract()
+
+        # Skip articles behind paywall
+        if soup.find(id="faz-paywall"):
+            self.abort_article()
+
+        # Remove F.A.Z. ad
+        for tag in soup.findAll(attrs={'class': 'body-elements__paragraph'}):
+            # An empty paragraph has no first child to inspect.
+            first = tag.contents[0] if tag.contents else None
+            if first and 'F.A.Z.-Newsletter' in first:
+                tag.extract()
+
+        # format liveblog
+        if soup.find(attrs={'class': 'tik4-live__container'}):
+            format_tickaroo_liveblog(soup)
+
+        # remove sizes and calc attributes in images: only src, alt and
+        # title are carried over to the replacement tag
+        for tag in soup.findAll('img'):
+            if tag.has_attr('src'):
+                new_img = soup.new_tag('img')
+                new_img['src'] = tag['src']
+                if tag.has_attr('alt'):
+                    new_img['alt'] = tag['alt']
+                if tag.has_attr('title'):
+                    new_img['title'] = tag['title']
+                tag.replace_with(new_img)
+        return soup
+
+    # Some last cleanup
+
+    def postprocess_html(self, soup, first_fetch):
+        """Finish image captions and hand the result to ``adeify_images``.
+
+        A "." is appended to the text of each figure caption / teaser image
+        detail whose text does not already end in ``.``, ``?``, ``!`` or
+        ``:``, so that caption and credit are separated.
+
+        :param soup: parsed article (BeautifulSoup-style object), modified
+            in place.
+        :param first_fetch: not used by this method.
+        :return: the value returned by ``self.adeify_images(soup)``.
+        """
+        # Position point between figure caption and figure credit, where needed
+        for tag in soup.findAll(attrs={'class': ['body-elements__image-figcaption', 'header-teaser__image-details']}):
+            if tag.string is None:
+                # Several children (or none): fall back to the first child,
+                # if there is one and it carries text.
+                if tag.contents and tag.contents[0].string:
+                    tag = tag.contents[0]
+            if tag.string:
+                text = str(tag.string).strip()
+                if text != '' and text[-1] not in ['.', '?', '!', ':']:
+                    tag.string.replace_with(text + ".")
+        return self.adeify_images(soup)