calibre/recipes/lenta_ru.recipe
Kovid Goyal 29cd8d64ea
Change shebangs to python from python2
Also remove a few other miscellaneous references to python2
2020-08-22 18:47:51 +05:30

182 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
'''
Lenta.ru
'''
from calibre.web.feeds.feedparser import parse
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe
import re
def new_tag(soup, name, attrs=()):
impl = getattr(soup, 'new_tag', None)
if impl is not None:
return impl(name, attrs=dict(attrs))
return Tag(soup, name, attrs=attrs or None)
class LentaRURecipe(BasicNewsRecipe):
title = u'Lenta.ru: \u041d\u043e\u0432\u043e\u0441\u0442\u0438'
__author__ = 'Nikolai Kotchetkov'
publisher = 'lenta.ru'
category = 'news, Russia'
description = u'''\u0415\u0436\u0435\u0434\u043d\u0435\u0432\u043d\u0430\u044f
\u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u0433\u0430\u0437\u0435\u0442\u0430.
\u041d\u043e\u0432\u043e\u0441\u0442\u0438 \u0441\u043e
\u0432\u0441\u0435\u0433\u043e \u043c\u0438\u0440\u0430 \u043d\u0430
\u0440\u0443\u0441\u0441\u043a\u043e\u043c
\u044f\u0437\u044b\u043a\u0435'''
description = u'Ежедневная интернет-газета. Новости со всего мира на русском языке'
oldest_article = 3
max_articles_per_feed = 100
masthead_url = u'http://img.lenta.ru/i/logowrambler.gif'
cover_url = u'http://img.lenta.ru/i/logowrambler.gif'
# Add feed names if you want them to be sorted (feeds of this list appear
# first)
sortOrder = [u'_default', u'В России', u'б.СССР', u'В мире']
encoding = 'cp1251'
language = 'ru'
no_stylesheets = True
remove_javascript = True
recursions = 0
conversion_options = {
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
keep_only_tags = [dict(name='td', attrs={'class': ['statya', 'content']})]
remove_tags_after = [dict(name='p', attrs={'class': 'links'}), dict(
name='div', attrs={'id': 'readers-block'})]
remove_tags = [dict(name='table', attrs={'class': ['vrezka', 'content']}), dict(name='div', attrs={
'class': 'b240'}), dict(name='div', attrs={'id': 'readers-block'}), dict(name='p', attrs={'class': 'links'})]
feeds = [u'http://lenta.ru/rss/']
extra_css = 'h1 {font-size: 1.2em; margin: 0em 0em 0em 0em;} h2 {font-size: 1.0em; margin: 0em 0em 0em 0em;} h3 {font-size: 0.8em; margin: 0em 0em 0em 0em;}' # noqa
def parse_index(self):
try:
feedData = parse(self.feeds[0])
if not feedData:
raise NotImplementedError
self.log("parse_index: Feed loaded successfully.")
def get_virtual_feed_articles(feed):
if feed in feeds:
return feeds[feed][1]
self.log("Adding new feed: ", feed)
articles = []
feeds[feed] = (feed, articles)
return articles
feeds = {}
# Iterate feed items and distribute articles using tags
for item in feedData.entries:
link = item.get('link', '')
title = item.get('title', '')
if '' == link or '' == title:
continue
article = {'title': title, 'url': link, 'description': item.get(
'description', ''), 'date': item.get('date', ''), 'content': ''}
if not item.get('tags'):
get_virtual_feed_articles('_default').append(article)
continue
for tag in item.tags:
addedToDefault = False
term = tag.get('term', '')
if '' == term:
if (not addedToDefault):
get_virtual_feed_articles(
'_default').append(article)
continue
get_virtual_feed_articles(term).append(article)
# Get feed list
# Select sorted feeds first of all
result = []
for feedName in self.sortOrder:
if (not feeds.get(feedName)):
continue
result.append(feeds[feedName])
del feeds[feedName]
result = result + feeds.values()
return result
except Exception as err:
self.log(err)
raise NotImplementedError
def preprocess_html(self, soup):
return self.adeify_images(soup)
def postprocess_html(self, soup, first_fetch):
contents = new_tag(soup, 'div')
# Extract tags with given attributes
extractElements = {'div': [{'id': 'readers-block'}]}
# Remove all elements that were not extracted before
for tag, attrs in extractElements.items():
for attr in attrs:
garbage = soup.findAll(tag, attr)
if garbage:
for pieceOfGarbage in garbage:
pieceOfGarbage.extract()
# Find article text using header
# and add all elements to contents
element = soup.find({'h1': True, 'h2': True})
if (element):
element.name = 'h1'
while element:
nextElement = element.nextSibling
element.extract()
contents.insert(len(contents.contents), element)
element = nextElement
# Place article date after header
dates = soup.findAll(text=re.compile(
r'\d{2}\.\d{2}\.\d{4}, \d{2}:\d{2}:\d{2}'))
if dates:
for date in dates:
for string in date:
parent = date.parent
if (parent and isinstance(parent, Tag) and 'div' == parent.name and 'dt' == ''.join(parent['class'])):
# Date div found
parent.extract()
parent[
'style'] = 'font-size: 0.5em; color: gray; font-family: monospace;'
contents.insert(1, parent)
break
# Place article picture after date
pic = soup.find('img')
if pic:
picDiv = new_tag(soup, 'div')
picDiv['style'] = 'width: 100%; text-align: center;'
pic.extract()
picDiv.insert(0, pic)
title = pic.get('title', None)
if title:
titleDiv = new_tag(soup, 'div')
titleDiv['style'] = 'font-size: 0.5em;'
titleDiv.insert(0, title)
picDiv.insert(1, titleDiv)
contents.insert(2, picDiv)
body = soup.find('td', {'class': ['statya', 'content']})
if body:
body.replaceWith(contents)
return soup