diff --git a/recipes/aif_ru.recipe b/recipes/aif_ru.recipe
index 086a9e04c9..bf5c7d3b5b 100644
--- a/recipes/aif_ru.recipe
+++ b/recipes/aif_ru.recipe
@@ -1,45 +1,24 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
-from __future__ import with_statement, unicode_literals
+from __future__ import unicode_literals
 from calibre.web.feeds.news import BasicNewsRecipe
 
 
-class AdvancedUserRecipe1592177429(BasicNewsRecipe):
-    title = 'Аргументы и Факты'
-    encoding = 'utf8'
-    language = 'ru'
+class AdvancedUserRecipe1645564525(BasicNewsRecipe):
+    title = 'Аргументы и Факты - aif.ru'
+    __author__ = 'MrBeef12'
     oldest_article = 7
-    max_articles_per_feed = 25
-    verbose = 3
+    max_articles_per_feed = 100
+    auto_cleanup = True
+    language = 'ru'
 
-    feeds = [
-        ('AIF', 'https://www.aif.ru/rss/all.php'),
+    feeds = [
+        ('ПОЛИТИКА', 'https://aif.ru/rss/politics.php'),
+        ('ДЕНЬГИ', 'https://aif.ru/rss/money.php'),
+        ('ОБЩЕСТВО', 'https://aif.ru/rss/society.php'),
+        ('ЗДОРОВЬЕ', 'https://aif.ru/rss/health.php'),
+        ('КУЛЬТУРА', 'https://aif.ru/rss/culture.php'),
+        ('СПОРТ', 'https://aif.ru/rss/sport.php'),
+        ('АВТОМОБИЛИ', 'https://aif.ru/rss/automobile.php'),
+        ('НЕДВИЖИМОСТЬ', 'https://aif.ru/rss/realty.php'),
     ]
-    INDEX = 'https://www.aif.ru/rss/all.php'
-
-    def parse_index(self):
-        feeds = []
-        section_title = 'aif'
-        articles = []
-        soup = self.index_to_soup(self.INDEX)
-        ii = 0
-        for item in soup.findAll('item'):
-            if ii < self.max_articles_per_feed:
-                try:
-                    ii = ii + 1
-                    A = str(item)
-                    i = A.find(u'link')
-                    j = A.find(u'description')
-                    ZZ = item.find('description')
-                    ZZ1 = str(ZZ)  # bs4.element.Tag to str
-                    ZZ2 = ZZ1[24:-19]
-                    AU = A[i:j]
-                    try:
-                        articles.append({'url':AU[6:-2], 'title':ZZ2})
-                    except Exception:
-                        pass
-                except Exception:
-                    self.log("Exception handled!")
-        if articles:
-            feeds.append((section_title, articles))
-        return feeds
diff --git a/recipes/lenta_ru.recipe b/recipes/lenta_ru.recipe
index 8275c1a989..a406916664 100644
--- a/recipes/lenta_ru.recipe
+++ b/recipes/lenta_ru.recipe
@@ -1,181 +1,17 @@
 #!/usr/bin/env python
-
-'''
-Lenta.ru
-'''
-
-from calibre.web.feeds.feedparser import parse
-from calibre.ebooks.BeautifulSoup import Tag
+# vim:fileencoding=utf-8
+from __future__ import unicode_literals
 from calibre.web.feeds.news import BasicNewsRecipe
-import re
-
-
-def new_tag(soup, name, attrs=()):
-    impl = getattr(soup, 'new_tag', None)
-    if impl is not None:
-        return impl(name, attrs=dict(attrs))
-    return Tag(soup, name, attrs=attrs or None)
-
-
-class LentaRURecipe(BasicNewsRecipe):
-    title = u'Lenta.ru: \u041d\u043e\u0432\u043e\u0441\u0442\u0438'
-    __author__ = 'Nikolai Kotchetkov'
-    publisher = 'lenta.ru'
-    category = 'news, Russia'
-    description = u'''\u0415\u0436\u0435\u0434\u043d\u0435\u0432\u043d\u0430\u044f
-    \u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u0433\u0430\u0437\u0435\u0442\u0430.
-    \u041d\u043e\u0432\u043e\u0441\u0442\u0438 \u0441\u043e
-    \u0432\u0441\u0435\u0433\u043e \u043c\u0438\u0440\u0430 \u043d\u0430
-    \u0440\u0443\u0441\u0441\u043a\u043e\u043c
-    \u044f\u0437\u044b\u043a\u0435'''
-    oldest_article = 3
+
+
+class AdvancedUserRecipe1645563203(BasicNewsRecipe):
+    title = 'Lenta.ru - Новости'
+    __author__ = 'MrBeef12'
+    oldest_article = 7
     max_articles_per_feed = 100
-
-    masthead_url = u'http://img.lenta.ru/i/logowrambler.gif'
-    cover_url = u'http://img.lenta.ru/i/logowrambler.gif'
-
-    # Add feed names if you want them to be sorted (feeds of this list appear
-    # first)
-    sortOrder = [u'_default', u'В России', u'б.СССР', u'В мире']
-
-    encoding = 'cp1251'
+    auto_cleanup = True
     language = 'ru'
-    no_stylesheets = True
-    remove_javascript = True
-    recursions = 0
-    conversion_options = {
-        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
-    }
-
-    keep_only_tags = [dict(name='td', attrs={'class': ['statya', 'content']})]
-
-    remove_tags_after = [dict(name='p', attrs={'class': 'links'}), dict(
-        name='div', attrs={'id': 'readers-block'})]
-
-    remove_tags = [dict(name='table', attrs={'class': ['vrezka', 'content']}), dict(name='div', attrs={
-        'class': 'b240'}), dict(name='div', attrs={'id': 'readers-block'}), dict(name='p', attrs={'class': 'links'})]
-
-    feeds = [u'http://lenta.ru/rss/']
-
-    extra_css = 'h1 {font-size: 1.2em; margin: 0em 0em 0em 0em;} h2 {font-size: 1.0em; margin: 0em 0em 0em 0em;} h3 {font-size: 0.8em; margin: 0em 0em 0em 0em;}'  # noqa
-
-    def parse_index(self):
-        try:
-            feedData = parse(self.feeds[0])
-            if not feedData:
-                raise NotImplementedError
-            self.log("parse_index: Feed loaded successfully.")
-
-            def get_virtual_feed_articles(feed):
-                if feed in feeds:
-                    return feeds[feed][1]
-                self.log("Adding new feed: ", feed)
-                articles = []
-                feeds[feed] = (feed, articles)
-                return articles
-
-            feeds = {}
-
-            # Iterate feed items and distribute articles using tags
-            for item in feedData.entries:
-                link = item.get('link', '')
-                title = item.get('title', '')
-                if '' == link or '' == title:
-                    continue
-                article = {'title': title, 'url': link, 'description': item.get(
-                    'description', ''), 'date': item.get('date', ''), 'content': ''}
-                if not item.get('tags'):
-                    get_virtual_feed_articles('_default').append(article)
-                    continue
-                for tag in item.tags:
-                    addedToDefault = False
-                    term = tag.get('term', '')
-                    if '' == term:
-                        if (not addedToDefault):
-                            get_virtual_feed_articles(
-                                '_default').append(article)
-                        continue
-                    get_virtual_feed_articles(term).append(article)
-
-            # Get feed list
-            # Select sorted feeds first of all
-            result = []
-            for feedName in self.sortOrder:
-                if (not feeds.get(feedName)):
-                    continue
-                result.append(feeds[feedName])
-                del feeds[feedName]
-            result = result + feeds.values()
-
-            return result
-
-        except Exception as err:
-            self.log(err)
-            raise NotImplementedError
-
-    def preprocess_html(self, soup):
-        return self.adeify_images(soup)
-
-    def postprocess_html(self, soup, first_fetch):
-
-        contents = new_tag(soup, 'div')
-
-        # Extract tags with given attributes
-        extractElements = {'div': [{'id': 'readers-block'}]}
-
-        # Remove all elements that were not extracted before
-        for tag, attrs in extractElements.items():
-            for attr in attrs:
-                garbage = soup.findAll(tag, attr)
-                if garbage:
-                    for pieceOfGarbage in garbage:
-                        pieceOfGarbage.extract()
-
-        # Find article text using header
-        # and add all elements to contents
-        element = soup.find({'h1': True, 'h2': True})
-        if (element):
-            element.name = 'h1'
-        while element:
-            nextElement = element.nextSibling
-            element.extract()
-            contents.insert(len(contents.contents), element)
-            element = nextElement
-
-        # Place article date after header
-        dates = soup.findAll(text=re.compile(
-            r'\d{2}\.\d{2}\.\d{4}, \d{2}:\d{2}:\d{2}'))
-        if dates:
-            for date in dates:
-                for string in date:
-                    parent = date.parent
-                    if (parent and isinstance(parent, Tag) and 'div' == parent.name and 'dt' == ''.join(parent['class'])):
-                        # Date div found
-                        parent.extract()
-                        parent[
-                            'style'] = 'font-size: 0.5em; color: gray; font-family: monospace;'
-                        contents.insert(1, parent)
-                        break
-
-        # Place article picture after date
-        pic = soup.find('img')
-        if pic:
-            picDiv = new_tag(soup, 'div')
-            picDiv['style'] = 'width: 100%; text-align: center;'
-            pic.extract()
-            picDiv.insert(0, pic)
-            title = pic.get('title', None)
-            if title:
-                titleDiv = new_tag(soup, 'div')
-                titleDiv['style'] = 'font-size: 0.5em;'
-                titleDiv.insert(0, title)
-                picDiv.insert(1, titleDiv)
-            contents.insert(2, picDiv)
-
-        body = soup.find('td', {'class': ['statya', 'content']})
-        if body:
-            body.replaceWith(contents)
-
-        return soup
+
+    feeds = [
+        ('Lenta.ru - Новости России и мира сегодня', 'https://lenta.ru/rss/'),
+    ]
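
Note: both rewritten recipes drop the hand-rolled parse_index / postprocess_html logic and rely on BasicNewsRecipe's stock RSS handling with auto_cleanup = True, which delegates article extraction to calibre's readability-style cleanup. One way to smoke-test either recipe before merging is calibre's documented recipe workflow (a sketch, assuming the commands run from the source checkout with calibre's CLI tools on PATH; --test caps the download at two articles from each of the first two feeds, and -vv makes fetch errors visible):

    ebook-convert recipes/aif_ru.recipe .epub --test -vv
    ebook-convert recipes/lenta_ru.recipe .epub --test -vv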