diff --git a/recipes/dilema.png b/recipes/dilema.png
new file mode 100644
index 0000000000..907f122ee1
Binary files /dev/null and b/recipes/dilema.png differ
diff --git a/recipes/dilema.recipe b/recipes/dilema.recipe
new file mode 100644
index 0000000000..c12280366e
--- /dev/null
+++ b/recipes/dilema.recipe
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from urllib.parse import urljoin
+
+class Dilema(BasicNewsRecipe):
+    title = 'Dilema'
+    __author__ = 'Cristi Ghera'
+    max_articles_per_feed = 100
+    description = '"Sint vechi, domnule!" (I.L. Caragiale)'
+    needs_subscription = False
+    language = 'ro'
+    country = 'RO'
+    category = 'politics, culture, Romania'
+    resolve_internal_links = True
+    remove_tags_before = { 'class': 'post' }
+    remove_tags_after = { 'class': 'post_content' }
+    remove_tags = [
+        dict(
+            attrs={
+                'class': [
+                    'single_meta_category',
+                    'avatar',
+                    'jm-post-like',
+                    'fa',
+                ]
+            }
+        ),
+        dict(
+            name=['div'],
+            attrs={
+                'class': ['mb-2']
+            }
+        ),
+        dict(id=['like', 'dlik']),
+        dict(name=['script', 'noscript', 'style']),
+    ]
+    remove_attributes = ["class", "id", "name", "style"]
+    encoding = 'utf-8'
+    no_stylesheets = True
+    ignore_duplicate_articles = {'url'}
+
+    def parse_index(self):
+        homepage_url = 'https://www.dilema.ro/'
+        soup = self.index_to_soup(homepage_url)
+
+        articles = []
+
+        # .banner-container
+        banner_container = soup.find('div', attrs={'class': 'banner-container'})
+        container = banner_container.find('h5')
+        a = container.find('a')
+        url = urljoin(homepage_url, a.attrs['href'])
+        articles.append(
+            dict(
+                title=self.tag_to_string(container).strip(),
+                url=url,
+                date=self.tag_to_string(banner_container.find(attrs={'class': 'post-date'})).strip(),
+                description='',
+                content=''
+            )
+        )
+
+        # .homepage_builder_3grid_post
+        containers = soup.findAll('div', attrs={'class': 'homepage_builder_3grid_post'})
+        for container in containers:
+            if self.tag_to_string(container.find('h2')) in ['CELE MAI RECENTE', 'CELE MAI CITITE']:
+                continue
+            for article in container.findAll('div', attrs={'class': 'blog_grid_post_style'}):
+                title_container = article.find('h3')
+                if not title_container:
+                    continue
+                url = title_container.find('a')['href']
+                url = urljoin(homepage_url, url)
+                article_title = self.tag_to_string(title_container).strip()
+                author = self.tag_to_string(
+                    article.find('a', attrs={'rel': 'author'})
+                ).strip()
+                summary = self.tag_to_string(article.find('p')).strip()
+                pubdate = self.tag_to_string(article.find(attrs={'class': 'post-date'}))
+                description = author + ' - ' + summary
+                articles.append(
+                    dict(
+                        title=article_title,
+                        url=url,
+                        date=pubdate,
+                        description=description,
+                        content=''
+                    )
+                )
+
+        sections = [("Numărul curent", articles)]
+        return sections
+
+    def preprocess_html(self, soup):
+        main_carousel = soup.find(attrs={'id': 'main-carousel'})
+        if main_carousel and main_carousel.find('img'):
+            img = main_carousel.find('img')
+            body = soup.find('body')
+            body.clear()
+            body.append(img)
+        return soup
+
+    def get_cover_url(self):
+        url = 'https://www.dilema.ro/coperta-saptaminii/'
+        soup = self.index_to_soup(url)
+        img = soup.find(attrs={'id': 'main-carousel'}).find('img')
+        return urljoin(url, img.attrs['src'])
diff --git a/recipes/internazionale.png b/recipes/internazionale.png
new file mode 100644
index 0000000000..b7dce84cfd
Binary files /dev/null and b/recipes/internazionale.png differ
diff --git a/recipes/internazionale.recipe b/recipes/internazionale.recipe
new file mode 100644
index 0000000000..52094ed9a4
--- /dev/null
+++ b/recipes/internazionale.recipe
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class Internazionale(BasicNewsRecipe):
+    title = 'Internazionale'
+    __author__ = 'Cristi Ghera'
+    max_articles_per_feed = 100
+    description = 'Internazionale - Notizie dall’Italia e dal mondo'
+    needs_subscription = False
+    language = 'it'
+    country = 'IT'
+    category = 'news, politics, Italy, world'
+    resolve_internal_links = True
+    remove_tags_before = { 'name': 'article' }
+    remove_tags_after = { 'name': 'article' }
+    remove_tags = [
+        dict(
+            attrs={
+                'class': [
+                    'item-banner',
+                    'hentryfeed__side',
+                    'magazine-article-share-tools',
+                    'magazine-article-share-popup',
+                    'article_next',
+                    'cta_nl_ext_container',
+                ]
+            }
+        ),
+        dict(name=['script', 'style']),
+    ]
+    remove_attributes = ["class", "id", "name", "style"]
+    encoding = 'utf-8'
+    no_stylesheets = True
+    ignore_duplicate_articles = {'url'}
+
+    current_number_url = "https://www.internazionale.it/sommario"
+    home_url = "https://www.internazionale.it"
+    cover_url = None
+
+    def extract_article(self, article):
+        url = article.find('a')['href']
+        if url.startswith('/'):
+            url = self.home_url + url
+        title_parts = []
+        tag = article.find('div', {'class': 'abstract-article__tag'})
+        if tag: title_parts.append(self.tag_to_string(tag).upper())
+        title_parts.append(self.tag_to_string(article.find('div', {'class': 'abstract-article__title'})))
+        article_title = ' \u2022 '.join(title_parts)
+        pubdate=''
+        description_parts = []
+        author = article.find('div', {'class': 'abstract-article__author'})
+        if author: description_parts.append(self.tag_to_string(author))
+        summary = article.find('div', {'class': 'abstract-article__content'})
+        if summary: description_parts.append(self.tag_to_string(summary))
+        description = ' \u2022 '.join(description_parts)
+        return dict(
+            title=article_title,
+            url=url,
+            date=pubdate,
+            description=description,
+            content=''
+        )
+
+    def parse_index(self):
+        soup = self.index_to_soup(self.current_number_url)
+        self.cover_url = soup.find('span', { 'class': 'img_expand' })['data-src']
+        main_container = soup.find('div', { 'class': 'content_data' })
+        children = main_container.findAll('div', recursive=False)
+        sections = []
+        current_section = None
+        for container in children:
+            if 'abstract-testatina' in container['class'] or 'abstract-testatina-cultura' in container['class']:
+                if current_section:
+                    sections.append(current_section)
+                current_section = (self.tag_to_string(container), [])
+                continue
+
+            if 'masonry-items' in container['class'] and current_section:
+                for article in container.findAll('div', {'class': 'abstract-article'}):
+                    current_section[1].append(self.extract_article(article))
+                continue
+
+            if 'abstract-article' in container['class'] and current_section:
+                current_section[1].append(self.extract_article(container))
+                continue
+
+            # print(container['class'])
+        if current_section:
+            sections.append(current_section)
+        return sections
+
+    def preprocess_html(self, soup):
+        for node in soup.findAll('figure'):
+            img_src = None
+            image_attributes = [
+                'data-media1024',
+                'data-media1025',
+                'data-media641',
+                'data-media321',
+                'data-media',
+            ]
+            for attr in image_attributes:
+                if node.has_attr(attr):
+                    img_src = node[attr]
+                    break
+            node.name = 'div'
+            if img_src:
+                img = soup.new_tag('img', src=img_src)
+                node.insert(0, img)
+        for node in soup.findAll('figcaption'):
+            node.name = 'div'
+        # if self.browser.cookiejar:
+        #     self.browser.cookiejar.clear()
+        return soup
+
+    def get_cover_url(self):
+        return self.cover_url
diff --git a/recipes/parool.png b/recipes/parool.png
new file mode 100644
index 0000000000..ef1fd6252f
Binary files /dev/null and b/recipes/parool.png differ
diff --git a/recipes/parool.recipe b/recipes/parool.recipe
new file mode 100644
index 0000000000..3a6bfa408a
--- /dev/null
+++ b/recipes/parool.recipe
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import uuid
+from mechanize import Request
+from contextlib import closing
+import json
+
+class Parool(BasicNewsRecipe):
+    title = 'Het Parool'
+    __author__ = 'Cristi Ghera'
+    max_articles_per_feed = 100
+    description = 'Het Parool - Vrij, Onverveerd'
+    needs_subscription = False
+    language = 'nl'
+    country = 'NL'
+    category = 'news, politics, Netherlands'
+    resolve_internal_links = True
+    remove_tags_before = dict(id='main-content')
+    remove_tags_after = dict(id='main-content')
+    remove_tags = [
+        dict(attrs={'class':['article-footer__sharing', 'artstyle__editorial-tips', 'artstyle__advertisement','artstyle__container__icon','artstyle__disabled-embed','container__title__icon',]}),
+        dict(attrs={'data-element-id': ['article-element-authors']}),
+        dict(name=['script', 'noscript', 'style']),
+    ]
+    remove_attributes = ["class", "id", "name", "style"]
+    encoding = 'utf-8'
+    no_stylesheets = True
+    ignore_duplicate_articles = {'url'}
+
+    def parse_index(self):
+        soup = self.index_to_soup('https://www.parool.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4()))
+        containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
+        sections = []
+        for container in containers:
+            section_title = self.tag_to_string(container.find('h2')).strip()
+            articles = []
+
+            for art in container.findAll('article'):
+                a = art.find('a')
+                url = a['href']
+                if url.startswith('/'):
+                    url = 'https://www.parool.nl' + url
+                if '/editie/' not in url:
+                    continue
+                header = a.find('header')
+                teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip()
+                teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip()
+                teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip()
+                ignore = { "dirkjan", "s1ngle", "pukkels", "hein de kort" }
+                if teaser_label.lower() in ignore:
+                    continue
+                parts = []
+                if teaser_label:
+                    parts.append(teaser_label.upper())
+                if teaser_sublabel:
+                    parts.append(teaser_sublabel)
+                if teaser_title:
+                    parts.append(teaser_title)
+                article_title = ' \u2022 '.join(parts)
+                articles.append(dict(title=article_title,
+                                     url=url,
+                                     content=''))
+
+            sections.append((section_title, articles))
+        return sections
+
+    def preprocess_html(self, soup):
+        for tag in soup():
+            if tag.name == 'img':
+                if tag.get('src', '').startswith('/'):
+                    tag['src'] = 'https://www.parool.nl' + tag['src']
+        for tag in soup():
+            if tag.name == "picture":
+                tag.replaceWith(tag.find("img"))
+        comic_articles = {
+            "Alle strips van Dirkjan",
+            "S1NGLE",
+            "Pukkels",
+            "Bekijk hier alle cartoons van Hein de Kort",
+        }
+        if self.tag_to_string(soup.find('h1')).strip() in comic_articles:
+            for node in soup.find('figure').find_next_siblings():
+                node.extract()
+        return soup
+
+    def get_cover_url(self):
+        headers = {
+            'X-Requested-With': 'XMLHttpRequest',
+            'Accept': 'application/json, text/javascript, */*; q=0.01',
+            'DNT': '1',
+        }
+        url = "https://login-api.e-pages.dk/v1/krant.parool.nl/folders"
+        with closing(self.browser.open(Request(url, None, headers))) as r:
+            folders = json.loads(r.read())
+        return folders["objects"][0]["teaser_medium"]
diff --git a/recipes/revista22.png b/recipes/revista22.png
new file mode 100644
index 0000000000..8a6e568c75
Binary files /dev/null and b/recipes/revista22.png differ
diff --git a/recipes/revista22.recipe b/recipes/revista22.recipe
new file mode 100644
index 0000000000..6f91d2bc00
--- /dev/null
+++ b/recipes/revista22.recipe
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class Revista22(BasicNewsRecipe):
+    title = 'Revista 22'
+    __author__ = 'Cristi Ghera'
+    max_articles_per_feed = 100
+    description = 'Revista 22'
+    needs_subscription = False
+    language = 'ro'
+    country = 'RO'
+    category = 'news, politics, Romania'
+    resolve_internal_links = True
+    remove_tags_before = { 'class': 'col-span-8' }
+    remove_tags_after = { 'class': 'col-span-8' }
+    remove_tags = [
+        dict(
+            attrs={
+                'class': [
+                    'icons',
+                    'float-left',
+                    'samesection',
+                ]
+            }
+        ),
+        dict(
+            name=['div'],
+            attrs={
+                'class': ['mb-2']
+            }
+        ),
+        dict(id=['comments']),
+        dict(name=['script', 'noscript', 'style']),
+    ]
+    remove_attributes = ["class", "id", "name", "style"]
+    encoding = 'utf-8'
+    no_stylesheets = True
+    ignore_duplicate_articles = {'url'}
+
+    def parse_index(self):
+        soup = self.index_to_soup('https://revista22.ro')
+        url = soup.find('div', attrs={'class': 'uppercase'}).find('a').attrs['href']
+        if url.startswith('/'):
+            url = 'https://revista22.ro' + url
+        soup = self.index_to_soup(url)
+        main_container = soup.find('div', attrs={'class': 'col-span-8'})
+        containers = main_container.findAll(attrs={'class': 'mb-4'})
+        articles = []
+        for container in containers:
+            if 'pb-4' not in container.attrs['class']:
+                continue
+            a = container.find('a')
+            url = a['href']
+            if url.startswith('/'):
+                url = 'https://revista22.ro' + url
+            article_title = self.tag_to_string(a.find('h3')).strip()
+            author = self.tag_to_string(
+                container.find('span', attrs={'class': 'text-red'})
+            ).strip()
+            summary = self.tag_to_string(container.find('p')).strip()
+            pubdate = self.tag_to_string(a.find('span'))
+            description = author + ' - ' + summary
+            articles.append(
+                dict(
+                    title=article_title,
+                    url=url,
+                    date=pubdate,
+                    description=description,
+                    content=''
+                )
+            )
+
+        sections = [('Numărul curent', articles)]
+        return sections
diff --git a/recipes/volkskrant.png b/recipes/volkskrant.png
new file mode 100644
index 0000000000..50a432ac85
Binary files /dev/null and b/recipes/volkskrant.png differ
diff --git a/recipes/volksrant.recipe b/recipes/volkskrant.recipe
similarity index 79%
rename from recipes/volksrant.recipe
rename to recipes/volkskrant.recipe
index 9a116aa7ce..6c80b890bd 100644
--- a/recipes/volksrant.recipe
+++ b/recipes/volkskrant.recipe
@@ -1,8 +1,10 @@
 #!/usr/bin/env python
-import uuid
-
 from calibre.web.feeds.recipes import BasicNewsRecipe
-
+import uuid
+from mechanize import Request
+from contextlib import closing
+import json
+
 class Volkskrant(BasicNewsRecipe):
     title = 'Volkskrant'
 
@@ -95,4 +97,24 @@ class Volkskrant(BasicNewsRecipe):
         if tag.name == 'img':
             if tag['src'][0] == '/':
                 tag['src'] = 'https://www.volkskrant.nl' + tag['src']
+
+        for tag in soup():
+            if tag.name == "picture":
+                tag.replaceWith(tag.find("img"))
+
+        comic_articles = { "Bas van der Schot", "Poldermodellen", "Gummbah", "Sigmund" }
+        if self.tag_to_string(soup.find('h1')).strip() in comic_articles:
+            for node in soup.find('figure').find_next_siblings():
+                node.extract()
         return soup
+
+    def get_cover_url(self):
+        headers = {
+            'X-Requested-With': 'XMLHttpRequest',
+            'Accept': 'application/json, text/javascript, */*; q=0.01',
+            'DNT': '1',
+        }
+        url = "https://login-api.e-pages.dk/v1/krant.volkskrant.nl/folders"
+        with closing(self.browser.open(Request(url, None, headers))) as r:
+            folders = json.loads(r.read())
+        return folders["objects"][0]["teaser_medium"]