calibre/recipes/lenta_ru.recipe
Kovid Goyal 567040ee1e Perform PEP8 compliance checks on the entire codebase
Some bits of PEP 8 are turned off via setup.cfg
2016-07-29 21:25:17 +05:30

180 lines
6.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python2
'''
Lenta.ru
'''
from calibre.web.feeds.feedparser import parse
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe
import re
class LentaRURecipe(BasicNewsRecipe):
title = u'Lenta.ru: \u041d\u043e\u0432\u043e\u0441\u0442\u0438'
__author__ = 'Nikolai Kotchetkov'
publisher = 'lenta.ru'
category = 'news, Russia'
description = u'''\u0415\u0436\u0435\u0434\u043d\u0435\u0432\u043d\u0430\u044f
\u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u0433\u0430\u0437\u0435\u0442\u0430.
\u041d\u043e\u0432\u043e\u0441\u0442\u0438 \u0441\u043e
\u0432\u0441\u0435\u0433\u043e \u043c\u0438\u0440\u0430 \u043d\u0430
\u0440\u0443\u0441\u0441\u043a\u043e\u043c
\u044f\u0437\u044b\u043a\u0435'''
description = u'Ежедневная интернет-газета. Новости со всего мира на русском языке'
oldest_article = 3
max_articles_per_feed = 100
masthead_url = u'http://img.lenta.ru/i/logowrambler.gif'
cover_url = u'http://img.lenta.ru/i/logowrambler.gif'
# Add feed names if you want them to be sorted (feeds of this list appear
# first)
sortOrder = [u'_default', u'В России', u'б.СССР', u'В мире']
encoding = 'cp1251'
language = 'ru'
no_stylesheets = True
remove_javascript = True
recursions = 0
conversion_options = {
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
keep_only_tags = [dict(name='td', attrs={'class': ['statya', 'content']})]
remove_tags_after = [dict(name='p', attrs={'class': 'links'}), dict(
name='div', attrs={'id': 'readers-block'})]
remove_tags = [dict(name='table', attrs={'class': ['vrezka', 'content']}), dict(name='div', attrs={
'class': 'b240'}), dict(name='div', attrs={'id': 'readers-block'}), dict(name='p', attrs={'class': 'links'})]
feeds = [u'http://lenta.ru/rss/']
extra_css = 'h1 {font-size: 1.2em; margin: 0em 0em 0em 0em;} h2 {font-size: 1.0em; margin: 0em 0em 0em 0em;} h3 {font-size: 0.8em; margin: 0em 0em 0em 0em;}' # noqa
def parse_index(self):
try:
feedData = parse(self.feeds[0])
if not feedData:
raise NotImplementedError
self.log("parse_index: Feed loaded successfully.")
if feedData.feed.has_key('title'): # noqa
self.title = feedData.feed.title
self.log("parse_index: Title updated to: ", self.title)
if feedData.feed.has_key('image'): # noqa
self.log("HAS IMAGE!!!!")
def get_virtual_feed_articles(feed):
if feeds.has_key(feed): # noqa
return feeds[feed][1]
self.log("Adding new feed: ", feed)
articles = []
feeds[feed] = (feed, articles)
return articles
feeds = {}
# Iterate feed items and distribute articles using tags
for item in feedData.entries:
link = item.get('link', '')
title = item.get('title', '')
if '' == link or '' == title:
continue
article = {'title': title, 'url': link, 'description': item.get(
'description', ''), 'date': item.get('date', ''), 'content': ''}
if not item.has_key('tags'): # noqa
get_virtual_feed_articles('_default').append(article)
continue
for tag in item.tags:
addedToDefault = False
term = tag.get('term', '')
if '' == term:
if (not addedToDefault):
get_virtual_feed_articles(
'_default').append(article)
continue
get_virtual_feed_articles(term).append(article)
# Get feed list
# Select sorted feeds first of all
result = []
for feedName in self.sortOrder:
if (not feeds.has_key(feedName)): # noqa
continue
result.append(feeds[feedName])
del feeds[feedName]
result = result + feeds.values()
return result
except Exception, err:
self.log(err)
raise NotImplementedError
def preprocess_html(self, soup):
return self.adeify_images(soup)
def postprocess_html(self, soup, first_fetch):
contents = Tag(soup, 'div')
# Extract tags with given attributes
extractElements = {'div': [{'id': 'readers-block'}]}
# Remove all elements that were not extracted before
for tag, attrs in extractElements.iteritems():
for attr in attrs:
garbage = soup.findAll(tag, attr)
if garbage:
for pieceOfGarbage in garbage:
pieceOfGarbage.extract()
# Find article text using header
# and add all elements to contents
element = soup.find({'h1': True, 'h2': True})
if (element):
element.name = 'h1'
while element:
nextElement = element.nextSibling
element.extract()
contents.insert(len(contents.contents), element)
element = nextElement
# Place article date after header
dates = soup.findAll(text=re.compile(
'\d{2}\.\d{2}\.\d{4}, \d{2}:\d{2}:\d{2}'))
if dates:
for date in dates:
for string in date:
parent = date.parent
if (parent and isinstance(parent, Tag) and 'div' == parent.name and 'dt' == parent['class']):
# Date div found
parent.extract()
parent[
'style'] = 'font-size: 0.5em; color: gray; font-family: monospace;'
contents.insert(1, parent)
break
# Place article picture after date
pic = soup.find('img')
if pic:
picDiv = Tag(soup, 'div')
picDiv['style'] = 'width: 100%; text-align: center;'
pic.extract()
picDiv.insert(0, pic)
title = pic.get('title', None)
if title:
titleDiv = Tag(soup, 'div')
titleDiv['style'] = 'font-size: 0.5em;'
titleDiv.insert(0, title)
picDiv.insert(1, titleDiv)
contents.insert(2, picDiv)
body = soup.find('td', {'class': ['statya', 'content']})
if body:
body.replaceWith(contents)
return soup