mirror of https://github.com/kovidgoyal/calibre.git
Added Parool (NL), Revista 22 (RO), Dilema (RO) and Internazionale (IT); updated Volkskrant to download cover
parent c581bad34e
commit 24befe49bb
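All four new recipes follow the same pattern: a BasicNewsRecipe subclass whose parse_index scrapes the publication's front or issue page and returns a list of (section_title, articles) tuples, each article being a dict with at least a title and a url. A minimal sketch of that shape for orientation (class name and URLs are placeholders, not part of this commit):

#!/usr/bin/env python
# Illustrative skeleton only -- names and URLs are placeholders.
from calibre.web.feeds.recipes import BasicNewsRecipe


class ExampleRecipe(BasicNewsRecipe):
    title = 'Example'
    language = 'en'
    no_stylesheets = True

    def parse_index(self):
        # Fetch and parse the front page, then group article dicts by section.
        soup = self.index_to_soup('https://example.com/')
        articles = [
            dict(title='An article', url='https://example.com/a',
                 date='', description='', content=''),
        ]
        return [('Front page', articles)]

    def get_cover_url(self):
        # Optional: point calibre at a cover image for the issue.
        return 'https://example.com/cover.jpg'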
BIN recipes/dilema.png Normal file (1.2 KiB)
recipes/dilema.recipe Normal file (107 lines)
@@ -0,0 +1,107 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe


class Dilema(BasicNewsRecipe):
    title = 'Dilema'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = '"Sint vechi, domnule!" (I.L. Caragiale)'
    needs_subscription = False
    language = 'ro'
    country = 'RO'
    category = 'politics, culture, Romania'
    resolve_internal_links = True
    remove_tags_before = {'class': 'post'}
    remove_tags_after = {'class': 'post_content'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'single_meta_category',
                    'avatar',
                    'jm-post-like',
                    'fa',
                ]
            }
        ),
        dict(
            name=['div'],
            attrs={
                'class': ['mb-2']
            }
        ),
        dict(id=['like', 'dlik']),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        homepage_url = 'https://www.dilema.ro/'
        soup = self.index_to_soup(homepage_url)

        articles = []

        # The lead article lives in .banner-container
        banner_container = soup.find('div', attrs={'class': 'banner-container'})
        container = banner_container.find('h5')
        a = container.find('a')
        url = homepage_url + a.attrs['href']
        articles.append(
            dict(
                title=self.tag_to_string(container).strip(),
                url=url,
                date=self.tag_to_string(banner_container.find(attrs={'class': 'post-date'})).strip(),
                description='',
                content=''
            )
        )

        # The remaining articles live in .homepage_builder_3grid_post grids
        containers = soup.findAll('div', attrs={'class': 'homepage_builder_3grid_post'})
        for container in containers:
            if self.tag_to_string(container.find('h2')) in ['CELE MAI RECENTE', 'CELE MAI CITITE']:
                continue
            for article in container.findAll('div', attrs={'class': 'blog_grid_post_style'}):
                title_container = article.find('h3')
                if not title_container:
                    continue
                url = title_container.find('a')['href']
                url = homepage_url + url
                article_title = self.tag_to_string(title_container).strip()
                author = self.tag_to_string(
                    article.find('a', attrs={'rel': 'author'})
                ).strip()
                summary = self.tag_to_string(article.find('p')).strip()
                pubdate = self.tag_to_string(article.find(attrs={'class': 'post-date'}))
                description = author + ' - ' + summary
                articles.append(
                    dict(
                        title=article_title,
                        url=url,
                        date=pubdate,
                        description=description,
                        content=''
                    )
                )

        sections = [('Numărul curent', articles)]
        return sections

    def preprocess_html(self, soup):
        # On the weekly cover page, keep only the carousel image.
        main_carousel = soup.find(attrs={'id': 'main-carousel'})
        if main_carousel:
            img = main_carousel.find('img')
            body = soup.find('body')
            body.clear()
            body.append(img)
        return soup

    def get_cover_url(self):
        url = 'https://www.dilema.ro/coperta-saptaminii/'
        soup = self.index_to_soup(url)
        img = soup.find(attrs={'id': 'main-carousel'}).find('img')
        return url + img.attrs['src']
BIN recipes/internazionale.png Normal file (1.6 KiB)
recipes/internazionale.recipe Normal file (117 lines)
@@ -0,0 +1,117 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe


class Internazionale(BasicNewsRecipe):
    title = 'Internazionale'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Internazionale - Notizie dall’Italia e dal mondo'
    needs_subscription = False
    language = 'it'
    country = 'IT'
    category = 'news, politics, Italy, world'
    resolve_internal_links = True
    remove_tags_before = {'name': 'article'}
    remove_tags_after = {'name': 'article'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'item-banner',
                    'hentryfeed__side',
                    'magazine-article-share-tools',
                    'magazine-article-share-popup',
                    'article_next',
                    'cta_nl_ext_container',
                ]
            }
        ),
        dict(name=['script', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    current_number_url = 'https://www.internazionale.it/sommario'
    home_url = 'https://www.internazionale.it'
    cover_url = None

    def extract_article(self, article):
        url = article.find('a')['href']
        if url[0] == '/':
            url = self.home_url + url
        # Build 'TAG • Title' style article titles.
        title_parts = []
        tag = article.find('div', {'class': 'abstract-article__tag'})
        if tag:
            title_parts.append(self.tag_to_string(tag).upper())
        title_parts.append(self.tag_to_string(article.find('div', {'class': 'abstract-article__title'})))
        article_title = ' \u2022 '.join(title_parts)
        pubdate = ''
        description_parts = []
        author = article.find('div', {'class': 'abstract-article__author'})
        if author:
            description_parts.append(self.tag_to_string(author))
        summary = article.find('div', {'class': 'abstract-article__content'})
        if summary:
            description_parts.append(self.tag_to_string(summary))
        description = ' \u2022 '.join(description_parts)
        return dict(
            title=article_title,
            url=url,
            date=pubdate,
            description=description,
            content=''
        )

    def parse_index(self):
        soup = self.index_to_soup(self.current_number_url)
        self.cover_url = soup.find('span', {'class': 'img_expand'})['data-src']
        main_container = soup.find('div', {'class': 'content_data'})
        children = main_container.findAll('div', recursive=False)
        sections = []
        current_section = None
        for container in children:
            # Section headings open a new section.
            if 'abstract-testatina' in container['class'] or 'abstract-testatina-cultura' in container['class']:
                if current_section:
                    sections.append(current_section)
                current_section = (self.tag_to_string(container), [])
                continue

            if 'masonry-items' in container['class']:
                for article in container.findAll('div', {'class': 'abstract-article'}):
                    current_section[1].append(self.extract_article(article))
                continue

            if 'abstract-article' in container['class']:
                current_section[1].append(self.extract_article(container))
                continue

        if current_section:
            sections.append(current_section)
        return sections

    def preprocess_html(self, soup):
        # Replace lazy-loaded <figure> blocks with plain <img> tags,
        # preferring the largest available image variant.
        for node in soup.findAll('figure'):
            img_src = None
            image_attributes = [
                'data-media1024',
                'data-media1025',
                'data-media641',
                'data-media321',
                'data-media',
            ]
            for attr in image_attributes:
                if node.has_attr(attr):
                    img_src = node[attr]
                    break
            node.name = 'div'
            if img_src:
                img = soup.new_tag('img', src=img_src)
                node.insert(0, img)
        for node in soup.findAll('figcaption'):
            node.name = 'div'
        return soup

    def get_cover_url(self):
        return self.cover_url
BIN recipes/parool.png Normal file (2.6 KiB)
recipes/parool.recipe Normal file (96 lines)
@@ -0,0 +1,96 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import uuid
from mechanize import Request
from contextlib import closing
import json


class Parool(BasicNewsRecipe):
    title = 'Het Parool'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Het Parool - Vrij, Onverveerd'
    needs_subscription = False
    language = 'nl'
    country = 'NL'
    category = 'news, politics, Netherlands'
    resolve_internal_links = True
    remove_tags_before = dict(id='main-content')
    remove_tags_after = dict(id='main-content')
    remove_tags = [
        dict(attrs={'class': [
            'article-footer__sharing',
            'artstyle__editorial-tips',
            'artstyle__advertisement',
            'artstyle__container__icon',
            'artstyle__disabled-embed',
            'container__title__icon',
        ]}),
        dict(attrs={'data-element-id': ['article-element-authors']}),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        # A fresh UUID gets us past the privacy wall straight to today's edition.
        soup = self.index_to_soup(
            'https://www.parool.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4())
        )
        containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
        sections = []
        for container in containers:
            section_title = self.tag_to_string(container.find('h2')).strip()
            articles = []

            for art in container.findAll('article'):
                a = art.find('a')
                url = a['href']
                if url[0] == '/':
                    url = 'https://www.parool.nl' + url
                if '/editie/' not in url:
                    continue
                header = a.find('header')
                teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip()
                teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip()
                teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip()
                # Skip the daily comic strips.
                ignore = {'dirkjan', 's1ngle', 'pukkels', 'hein de kort'}
                if teaser_label.lower() in ignore:
                    continue
                parts = []
                if teaser_label:
                    parts.append(teaser_label.upper())
                if teaser_sublabel:
                    parts.append(teaser_sublabel)
                if teaser_title:
                    parts.append(teaser_title)
                article_title = ' \u2022 '.join(parts)
                articles.append(dict(title=article_title,
                                     url=url,
                                     content=''))

            sections.append((section_title, articles))
        return sections

    def preprocess_html(self, soup):
        for tag in soup():
            if tag.name == 'img':
                if tag['src'][0] == '/':
                    tag['src'] = 'https://www.parool.nl' + tag['src']
        for tag in soup():
            if tag.name == 'picture':
                tag.replaceWith(tag.find('img'))
        # On comic overview pages, keep only the strip itself (the first figure).
        comic_articles = {
            'Alle strips van Dirkjan',
            'S1NGLE',
            'Pukkels',
            'Bekijk hier alle cartoons van Hein de Kort',
        }
        if self.tag_to_string(soup.find('h1')).strip() in comic_articles:
            for node in soup.find('figure').find_next_siblings():
                node.extract()
        return soup

    def get_cover_url(self):
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'DNT': '1',
        }
        url = 'https://login-api.e-pages.dk/v1/krant.parool.nl/folders'
        with closing(self.browser.open(Request(url, None, headers))) as r:
            folders = json.loads(r.read())
            # Use the teaser image of the first listed folder as the cover.
            return folders['objects'][0]['teaser_medium']
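Het Parool and de Volkskrant (updated below) share the same cover lookup against the e-pages API. A standalone sketch of that call, assuming only that the endpoint keeps the response shape the recipes rely on (the first entry in objects carrying a teaser_medium image URL):

import json
from contextlib import closing

from mechanize import Browser, Request


def epages_cover(publication):
    # publication is e.g. 'krant.parool.nl' or 'krant.volkskrant.nl'
    url = 'https://login-api.e-pages.dk/v1/' + publication + '/folders'
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
    }
    br = Browser()
    br.set_handle_robots(False)
    with closing(br.open(Request(url, None, headers))) as r:
        folders = json.loads(r.read())
    # Assumption: the first listed folder is the current issue.
    return folders['objects'][0]['teaser_medium']


if __name__ == '__main__':
    print(epages_cover('krant.parool.nl'))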
BIN recipes/revista22.png Normal file (3.7 KiB)
recipes/revista22.recipe Normal file (74 lines)
@@ -0,0 +1,74 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe


class Revista22(BasicNewsRecipe):
    title = 'Revista 22'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Revista 22'
    needs_subscription = False
    language = 'ro'
    country = 'RO'
    category = 'news, politics, Romania'
    resolve_internal_links = True
    remove_tags_before = {'class': 'col-span-8'}
    remove_tags_after = {'class': 'col-span-8'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'icons',
                    'float-left',
                    'samesection',
                ]
            }
        ),
        dict(
            name=['div'],
            attrs={
                'class': ['mb-2']
            }
        ),
        dict(id=['comments']),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        # Follow the homepage link to the current issue.
        soup = self.index_to_soup('https://revista22.ro')
        url = soup.find('div', attrs={'class': 'uppercase'}).find('a').attrs['href']
        if url[0] == '/':
            url = 'https://revista22.ro' + url
        soup = self.index_to_soup(url)
        main_container = soup.find('div', attrs={'class': 'col-span-8'})
        containers = main_container.findAll(attrs={'class': 'mb-4'})
        articles = []
        for container in containers:
            if 'pb-4' not in container.attrs['class']:
                continue
            a = container.find('a')
            url = a['href']
            if url[0] == '/':
                url = 'https://revista22.ro' + url
            article_title = self.tag_to_string(a.find('h3')).strip()
            author = self.tag_to_string(
                container.find('span', attrs={'class': 'text-red'})
            ).strip()
            summary = self.tag_to_string(container.find('p')).strip()
            pubdate = self.tag_to_string(a.find('span'))
            description = author + ' - ' + summary
            articles.append(
                dict(
                    title=article_title,
                    url=url,
                    date=pubdate,
                    description=description,
                    content=''
                )
            )

        sections = [('Numărul curent', articles)]
        return sections
BIN recipes/volkskrant.png Normal file (12 KiB)
recipes/volkskrant.recipe
@@ -1,8 +1,9 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import uuid
from mechanize import Request
from contextlib import closing
import json


class Volkskrant(BasicNewsRecipe):
    title = 'Volkskrant'
@@ -95,4 +96,25 @@ class Volkskrant(BasicNewsRecipe):
            if tag.name == 'img':
                if tag['src'][0] == '/':
                    tag['src'] = 'https://www.volkskrant.nl' + tag['src']

        for tag in soup():
            if tag.name == 'picture':
                tag.replaceWith(tag.find('img'))

        # On comic overview pages, keep only the strip itself (the first figure).
        comic_articles = {'Bas van der Schot', 'Poldermodellen', 'Gummbah', 'Sigmund'}
        if self.tag_to_string(soup.find('h1')).strip() in comic_articles:
            for node in soup.find('figure').find_next_siblings():
                node.extract()
        return soup

    def get_cover_url(self):
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'DNT': '1',
        }
        url = 'https://login-api.e-pages.dk/v1/krant.volkskrant.nl/folders'
        with closing(self.browser.open(Request(url, None, headers))) as r:
            folders = json.loads(r.read())
            # Use the teaser image of the first listed folder as the cover.
            return folders['objects'][0]['teaser_medium']
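Any of these recipe files can be smoke-tested from the command line with, for example, ebook-convert recipes/parool.recipe .epub --test -vv; the --test flag restricts the run to a couple of articles per feed so a full issue is not downloaded.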