mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Faz.net by Anonymous
This commit is contained in:
parent
5e8faec6eb
commit
fd20bf9baa
@ -22,7 +22,7 @@ class DeMorganBe(BasicNewsRecipe):
|
||||
ignore_duplicate_articles = {'url'}
|
||||
masthead_url = 'https://www.demorgen.be/_next/static/media/demorgen_logo.dce579e2.svg'
|
||||
cover_url = 'https://usercontent.one/wp/www.insidejazz.be/wp-content/uploads/2018/11/pic0143.png'
|
||||
|
||||
|
||||
extra_css = """
|
||||
time, [data-test-id:"article-label"], [data-test-id:"article-sublabel"], [[data-test-id:"article-author"]] { font-size:small; }
|
||||
[data-test-id:"header-intro"] { font-style: italic; }
|
||||
|
275
recipes/faz_net.recipe
Normal file
275
recipes/faz_net.recipe
Normal file
@ -0,0 +1,275 @@
|
||||
import json
|
||||
import re
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
|
||||
|
||||
|
||||
def format_tickaroo_liveblog(soup):
    """Tidy up an embedded Tickaroo live blog in place.

    Four passes are made over *soup*:

    1. Every ``img.tik4-media-image__img`` that carries a ``srcset`` gets a
       plain ``src``: the URL paired with the ``960w`` descriptor if there is
       one, otherwise (only when the image has no ``src`` yet) the first URL
       of the ``srcset``.
    2. A ``<br>`` is inserted before each rich-text content block in
       position 2.
    3. A ``<br>`` is inserted before every ``<time>`` element.
    4. Inside each author wrapper the author name is re-classed as
       ``tik4-media-body__title`` and moved directly behind the thumbnail.

    :param soup: parsed article (BeautifulSoup-like object); modified in place.
    :return: ``None``
    """
    for img in soup.findAll('img', attrs={'class': 'tik4-media-image__img', 'srcset': True}):
        tokens = img['srcset'].split()
        if not tokens:
            # Empty srcset: nothing to choose from, leave the image untouched.
            continue
        # A width descriptor always follows its URL, so start at position 1;
        # this also keeps ``pos - 1`` from wrapping around to the last token.
        preferred = next(
            (tokens[pos - 1] for pos in range(1, len(tokens))
             if tokens[pos].rstrip(',') == '960w'),
            None,
        )
        if preferred is not None:
            img['src'] = preferred.rstrip(',')
        elif not img.has_attr('src'):
            img['src'] = tokens[0].rstrip(',')

    for div in soup.findAll('div', attrs={'class':'tik4-content-block tik4-content-block--rich-text tik4-content-block--position-2'}):
        div.insert_before(soup.new_tag('br'))

    # Put every timestamp on a line of its own.
    for time_tag in soup.findAll('time'):
        time_tag.insert_before(soup.new_tag('br'))

    for wrapper in soup.findAll(class_='tik4-author__wrapper'):
        name = wrapper.find(class_='tik4-author__name')
        if not name:
            continue
        name = name.extract()
        name['class'] = 'tik4-media-body__title'
        thumb = wrapper.find(class_='tik4-author__thumb')
        if thumb:
            thumb.insert_after(name)
        # NOTE(review): when the wrapper has no thumbnail the detached name is
        # not re-inserted (same as before) - confirm that dropping it is wanted.
|
||||
|
||||
# process run of images
|
||||
def bilderstrecke(soup, tag):
    """Rebuild an image gallery from the page's ``__NUXT_DATA__`` payload.

    The first child of *tag* is parsed as JSON. The payload is expected to be
    a flat list in which gallery entries are dicts whose ``caption``,
    ``source`` and ``defaultUrl`` values are indices into that same list.
    For every entry with a caption and a resolvable image URL a
    ``<figure>`` (image + caption paragraph + optional credit span) is
    created; all figures are collected in one ``<div>`` that is inserted
    after the ``header-teaser`` element. Finally the default teaser images
    are removed.

    The function returns without touching *soup* when the payload cannot be
    parsed, is not a list, or holds no dict with a ``caption`` key. It also
    returns before inserting anything when no ``header-teaser`` element is
    left to anchor the gallery.

    :param soup: parsed article (BeautifulSoup-like object); modified in place.
    :param tag: the ``<script>`` element holding the JSON payload.
    :return: ``None``
    """
    try:
        struct = json.loads(str(tag.contents[0]))
    except (AttributeError, IndexError, ValueError, RecursionError):
        # Best effort: no payload or unparsable payload means no gallery.
        return

    if not isinstance(struct, list):
        return
    entries = [v for v in struct if isinstance(v, dict) and 'caption' in v]
    if not entries:
        return

    # When the teaser occurs more than once, drop the first occurrence.
    teasers = soup.findAll(class_='header-teaser')
    if len(teasers) > 1:
        teasers[0].extract()
    anchor = soup.find(class_='header-teaser')
    if anchor is None:
        # Nothing to attach the gallery to; leave the page as it is.
        return

    collect = soup.new_tag('div')
    for entry in entries:
        if 'defaultUrl' not in entry:
            continue
        url = _nuxt_string(struct, entry['defaultUrl'])
        if url is None:
            continue

        cap = soup.new_tag('p')
        cap['class'] = "body-elements__image-figcaption"
        text = _nuxt_string(struct, entry['caption'])
        if text is not None:
            cap.append(text)
        if 'source' in entry:
            credit = _nuxt_string(struct, entry['source'])
            if credit is not None:
                cred = soup.new_tag('span')
                cred['class'] = "body-elements__image-figcaption--source"
                cred.append(credit)
                cap.append(cred)

        img = soup.new_tag('img')
        img['src'] = url
        fig = soup.new_tag("figure")
        fig.append(img)
        fig.append(cap)
        collect.append(fig)

    anchor.insert_after(collect)

    for default_image in soup.findAll(class_='header-teaser__image--default'):
        default_image.extract()


def _nuxt_string(struct, ref):
    """Return the string stored at index *ref* of the flat payload, else None.

    ``None`` is returned when *ref* is not an integer-like value, is outside
    the list, or points at something that is not a string. Negative values
    are rejected instead of being used as Python end-relative indices.
    NOTE(review): the payload looks like devalue output, where negative
    references are sentinels (undefined, NaN, ...) - confirm.
    """
    try:
        index = int(ref)
    except (TypeError, ValueError):
        return None
    if not 0 <= index < len(struct):
        return None
    value = struct[index]
    return value if isinstance(value, str) else None
|
||||
|
||||
def story(soup, tag):
    """Place the first caption right behind the first lazily loaded image.

    Looks up the first ``<img loading="lazy">`` and the first
    ``<figcaption class="caption">`` in *soup*; when both exist, the caption
    is detached from its current position and re-inserted directly after the
    image. Otherwise *soup* is left unchanged.

    :param soup: parsed article (BeautifulSoup-like object); modified in place.
    :param tag: the story container found by the caller; not used here.
    :return: ``None``
    """
    lead_image = soup.find('img', attrs={'loading': 'lazy'})
    lead_caption = soup.find('figcaption', attrs={'class': 'caption'})
    if not (lead_image and lead_caption):
        return
    detached_caption = lead_caption.extract()
    lead_image.insert_after(detached_caption)
|
||||
|
||||
|
||||
class FazNet(BasicNewsRecipe):
    """Calibre recipe that downloads FAZ.NET articles from the site's RSS feeds.

    ``preprocess_html`` reshapes story pages, image galleries and Tickaroo
    live blogs and aborts articles that carry the paywall marker;
    ``postprocess_html`` puts a full stop between figure caption and credit.
    """
    # Version 9.1m
    # Update 2024-05
    # original by Armin Geller
    # overhaul to deal with changes in the faz.net websites

    title = 'FAZ.NET'
    __author__ = 'Unknown'
    description = 'Frankfurter Allgemeine Zeitung'
    publisher = 'Frankfurter Allgemeine Zeitung GmbH'
    category = 'news, politics, Germany'
    cover_url = 'https://upload.wikimedia.org/wikipedia/commons/7/72/Frankfurter_Allgemeine_logo.svg'
    encoding = 'utf-8'
    language = 'de'
    ignore_duplicate_articles = {'title', 'url'}
    max_articles_per_feed = 30
    no_stylesheets = True
    remove_javascript = True
    # NOTE(review): (10, 100) looks very small for a maximum (width, height)
    # in pixels - confirm against the calibre docs that this is intended.
    scale_news_images = (10,100)
    delay = 1

    # Debugging switch: set to True to fetch only the single sample article
    # returned by parse_index() below instead of the RSS feeds.
    test_article = False

    extra_css = '''
        .header-title,.scrolly-title {font-size: 1.5em; font-weight:bold; text-align:left;}
        .quote {font-size: 1.5em; font-weight:bold; text-align:center;}
        .author {font-size: 0.7em; font-weight:bold; text-align:center; display:block;
        margin-bottom: 0.95em; color:grey;}
        .header-label__content {font-size: 0.7em; font-weight:bold; text-align:left; display:block;
        margin-bottom: 0.95em; color:grey;}
        h3 {font-size:1.3em;text-align:left;}
        .caption,.body-elements__image-figcaption,.header-teaser__image-details,.tik4-media-body__title,.scrolly-text {
        margin-top:0.05em;margin-bottom:1em; font-size: 0.85em; text-align:left;}
        .body-elements__image-figcaption--source,.header-teaser__image-details--source,.tik4-media-body__credit {
        font-size: 0.65em; font-style:italic; text-align:left;margin-left:0.4em;}
        .header-detail--bold {font-size:0.6em; font-weight:bold; margin-bottom:0.75em;text-align:left;}
        time {font-size:0.6em; font-weight: normal; margin-bottom:0.75em; text-align:left; display:block;}
        .header-teaser,.scrolly-intro {font-size:1em; font-style:italic; font-weight:bold;margin-bottom:1em;}
        .tik4-media-image {margin-bottom:1em;margin-top:1em;}
    '''

    keep_only_tags = [
        dict(name='article', attrs={'class':['article','storytelling']}),
        dict(name='body'),
        dict(name='div', attrs={'class':['imageGallery','image_only']}),
        dict(name = 'div', attrs ={'class':'tik4-live__container'}),
        # Kept because preprocess_html() reads the gallery data from it.
        dict(name = 'script', attrs = {'id':'__NUXT_DATA__'}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class':[
            'related-articles','consent-placeholder',
            'article-footer content-container',
            'tik4-sharing','tik4-load-more-bottom',
            'tik4-by','header-detail__image','mm-adbox','upper-toolbar content-container'
        ]}),
        # <script> is not listed here: preprocess_html() still looks up
        # script#__NUXT_DATA__ for image galleries.
        dict(name = "style"),
        dict(name='svg'),
        dict(name='div', attrs={'data-module':'teaser'}),
    ]

    remove_attributes = ['onclick']

    if not test_article:
        feeds = [
            ('FAZ.NET Aktuell', 'https://www.faz.net/rss/aktuell/'),
            ('Politik', 'https://www.faz.net/rss/aktuell/politik/'),
            ('Wirtschaft', 'https://www.faz.net/rss/aktuell/wirtschaft/'),
            ('Feuilleton', 'https://www.faz.net/rss/aktuell/feuilleton/'),
            ('Sport', 'https://www.faz.net/rss/aktuell/sport/'),
            ('Lebensstil', 'https://www.faz.net/rss/aktuell/lebensstil/'),
            ('Gesellschaft', 'https://www.faz.net/rss/aktuell/gesellschaft/'),
            ('Finanzen', 'https://www.faz.net/rss/aktuell/finanzen/'),
            ('Technik & Motor', 'https://www.faz.net/rss/aktuell/technik-motor/'),
            ('Wissen', 'https://www.faz.net/rss/aktuell/wissen/'),
            ('Reise', 'https://www.faz.net/rss/aktuell/reise/'),
            ('Karriere & Hochschule', 'https://www.faz.net/rss/aktuell/karriere-hochschule/'),
            ('Rhein-Main', 'https://www.faz.net/rss/aktuell/rhein-main/'),
        ]
    else:
        def parse_index(self):
            """Return a one-article index used for debugging the recipe."""
            test_url = 'https://www.faz.net/aktuell/stil/mode-im-em-jahr-wir-zeigen-wie-fussball-und-mode-zusammengehoeren-19766969.html'
            # Other sample articles that were used for testing:
            # https://www.faz.net/aktuell/feuilleton/buecher/film-eruption-ein-thriller-aus-dem-nachlass-von-michael-crichton-19770491.html
            # https://www.faz.net/aktuell/stil/mode-design/leonie-benesch-sandra-hueller-ist-eine-meiner-heldinnen-19671638.html
            # https://www.faz.net/aktuell/feuilleton/medien/sabine-postel-zum-siebzigsten-die-briten-nannten-sie-german-traktor-19708409.html
            # https://www.faz.net/aktuell/stil/mode-design/von-richert-beil-bis-william-fan-wer-kauft-denn-das-19666592.html
            # https://www.faz.net/aktuell/feuilleton/buecher/rezensionen/sachbuch/tom-mustills-buch-die-sprache-der-wale-19657782.html
            # https://www.faz.net/rss/aktuell/feuilleton/kunst-und-architektur/berlinische-galerie-zeigt-edvard-munch-die-ganze-gefuehlsskala-des-lebens-19180631.html?printPagedArticle=true#pageIndex_2
            return [('Articles', [{'title': 'Test article', 'url': test_url}])]

    def preprocess_html(self, soup):
        """Reshape a downloaded article before calibre processes it further.

        Steps, in order: fix story-type pages, expand image galleries from
        the ``__NUXT_DATA__`` payload, unwrap buttons, strip the screen-reader
        colon from the header label, abort paywalled articles, remove the
        newsletter advertisement, format live blogs and reduce every image
        to its ``src``/``alt``/``title`` attributes.

        :param soup: parsed article; modified in place.
        :return: the same *soup* object.
        """
        # Format story-type article
        tag = soup.find(class_='storyContainer')
        if tag:
            story(soup,tag)

        # Extract images and text from image galleries. A gallery is
        # announced by a paragraph consisting only of "<n> Bilder".
        for par in soup.findAll('p'):
            if len(par.contents) != 1:
                continue
            if re.search(r"^[1-9]\d* Bilder$", str(par.contents[0])):
                payload = soup.find('script', attrs={'id':"__NUXT_DATA__",'type':'application/json'})
                if payload is not None:
                    bilderstrecke(soup, payload)
                # Only the first gallery marker is processed.
                break

        # unwrap buttons
        for tag in soup.findAll('button'):
            tag.unwrap()

        # remove the ":" that is only meant for screen readers
        tag = soup.find(class_ ="header-label__content")
        if tag:
            colon = tag.find(class_ ="sr-only")
            if colon:
                colon.extract()

        # Skip articles behind paywall
        if soup.find(id = "faz-paywall"):
            self.abort_article()

        # Remove F.A.Z. ad
        for tag in soup.findAll(attrs={'class': 'body-elements__paragraph'}):
            # Empty paragraphs have no first child to inspect.
            if tag.contents and 'F.A.Z.-Newsletter' in tag.contents[0]:
                tag.extract()

        # format liveblog
        if soup.find(attrs={'class':'tik4-live__container'}):
            format_tickaroo_liveblog(soup)

        # remove sizes and calc attributes in images by replacing each image
        # with a fresh <img> that carries only src, alt and title
        for old_img in soup.findAll('img'):
            if not old_img.has_attr('src'):
                continue
            new_img = soup.new_tag('img')
            for attr in ('src', 'alt', 'title'):
                if old_img.has_attr(attr):
                    new_img[attr] = old_img[attr]
            old_img.replace_with(new_img)
        return soup

    # Some last cleanup
    def postprocess_html(self, soup, first_fetch):
        """End figure captions with punctuation so the credit reads separately.

        For every caption element whose text does not already end in one of
        ``. ? ! :`` a full stop is appended. When the element has several
        children, its first child is used as the caption text.

        :param soup: parsed article; modified in place.
        :param first_fetch: not used here.
        :return: the result of ``self.adeify_images(soup)``.
        """
        # Position point between figure caption and figure credit, where needed
        for tag in soup.findAll(attrs={'class':['body-elements__image-figcaption','header-teaser__image-details']}):
            # ``tag.string`` is None for empty elements as well, so make sure
            # there is a first child before looking at it.
            if tag.string is None and tag.contents and tag.contents[0].string:
                tag = tag.contents[0]
            if tag.string:
                text = str(tag.string).strip()
                if text and not text.endswith(('.', '?', '!', ':')):
                    tag.string.replace_with(text + ".")
        return self.adeify_images(soup)
|
Loading…
x
Reference in New Issue
Block a user