mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-09-29 15:31:08 -04:00
The website https://www.equestriadaily.com/ often uses images in its articles, and from what I have seen the images get compressed so much that text is basically unreadable on them. This fix mitigates this issue by reducing the compression factor from 16 to 4, which slightly increases the size of the newspaper but makes it more enjoyable and readable.
82 lines
2.9 KiB
Python
82 lines
2.9 KiB
Python
#!/usr/bin/env python
|
|
# vim:fileencoding=utf-8
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
from calibre.utils.date import parse_date, utcnow
|
|
|
|
|
|
class AdvancedUserRecipe1639926896(BasicNewsRecipe):
    """News recipe for https://www.equestriadaily.com/.

    Every entry of ``sections`` is read from the site's
    ``/search/label/<label>`` listing page; the posts found there are kept
    when they are at most ``oldest_article`` days old.
    """

    __author__ = "Aisteru"
    __copyright__ = "2021, Timothée Andres <timothee dot andres at gmail dot com>"
    __license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'

    title = "Equestria Daily"
    description = "Everything new in Equestria and beyond!"
    language = 'en_US'

    # Max. supported by website: 50
    max_articles_per_feed = 30

    compress_news_images = True
    # The site's images often contain text; a factor of 4 (rather than 16)
    # keeps that text readable at the cost of a slightly larger output.
    compress_news_images_auto_size = 4
    no_stylesheets = True
    keep_only_tags = [{'name': 'div', 'class_': ['post', 'hentry']}]
    remove_tags = [{'name': 'div', 'class_': 'post-footer'}]
    extra_css = '.article_date { margin-left: 10px; }'

    # Masthead image dimensions
    MI_WIDTH = 600
    MI_HEIGHT = 200

    # (section title, label name used in the site's /search/label/ URL)
    # To discard posts under a certain section, simply comment the whole line
    sections = [
        ("Art", 'Art'),
        ("News", 'News'),
        ("Fics", 'Fanfiction'),
        ("Media", 'Media'),
        ("Comics", 'Comic'),
        ("Community", 'Community'),
        ("Editorial", 'Editorial'),
    ]

    def get_masthead_url(self):
        """Return the ``src`` of the image inside the front page's ``#header``.

        When the front page has no such image, or the image has no ``src``,
        the ``masthead_url`` attribute is returned instead (``None`` if it is
        not set) rather than raising.
        """
        soup = self.index_to_soup('https://www.equestriadaily.com')
        img = soup.select_one('#header img')
        src = img.get('src') if img is not None else None
        if not src:
            self.log.warn('Equestria Daily: no masthead image found on the front page')
            return getattr(self, 'masthead_url', None)
        return src

    @staticmethod
    def _clean_description(description):
        """Strip every line of *description* and drop the lines left empty."""
        stripped = (line.strip() for line in description.split('\n'))
        return '\n'.join(line for line in stripped if line)

    @staticmethod
    def _extract_date_text(article):
        """Return the raw date text of a post element, or ``''`` if absent."""
        stamp = article.select_one('span.post-timestamp')
        if stamp is None:
            return ''
        parts = stamp.text.split('\n')
        # The date is taken from the second line of the element's text; a
        # single-line timestamp is used as-is instead of raising IndexError.
        return parts[1] if len(parts) > 1 else parts[0]

    def parse_index(self):
        """Collect the recent posts of every configured section.

        Returns:
            A list of ``(section_title, articles)`` tuples in the order of
            ``sections``. Sections without any retained post are omitted.
            Each article is a dict with the keys ``title``, ``url``,
            ``date``, ``description`` and ``content``.
        """
        results = {}
        current_date = utcnow()

        for section_name, section_url_name in self.sections:
            soup = self.index_to_soup(
                f'https://www.equestriadaily.com/search/label/{section_url_name}?max-results={self.max_articles_per_feed}')

            # A post whose date cannot be parsed is given the date of the
            # post listed before it (or "now" for the first post).
            previous_post_date = current_date

            for article in soup.select('div.post.hentry'):
                header = article.select_one('h3 > a')
                if header is None or not header.get('href'):
                    # Without a title link there is nothing to download.
                    continue

                body = article.select_one('div.entry-content')
                date_text = self._extract_date_text(article)
                article_entry = {
                    'title': header.text,
                    'url': header['href'],
                    'date': date_text,
                    'description': self._clean_description(body.text) if body is not None else '',
                    'content': '',  # Must be empty
                }

                post_date = previous_post_date
                if date_text.strip():
                    try:
                        post_date = parse_date(date_text)
                    except Exception as err:
                        # Best effort: keep the fallback date, but leave a trace.
                        self.log.warn(
                            f'Equestria Daily: could not parse date {date_text!r} '
                            f'of {article_entry["url"]}: {err}')
                    else:
                        previous_post_date = post_date

                if (current_date - post_date).days <= self.oldest_article:
                    results.setdefault(section_name, []).append(article_entry)

        return list(results.items())
|