Implement #4141 (RIA Novosti in English and Spanish)

This commit is contained in:
Kovid Goyal 2009-12-05 20:15:43 -07:00
parent ae438073b7
commit f962231a7c
3 changed files with 83 additions and 7 deletions

View File

@ -0,0 +1,42 @@
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
en.rian.ru
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Ria_eng(BasicNewsRecipe):
    """News recipe for en.rian.ru (RIA Novosti, English edition)."""

    # Descriptive metadata; several of these values are reused in
    # conversion_options below.
    __author__ = 'Darko Miletic'
    title = 'Ria Novosti'
    description = 'News from Russia in English'
    publisher = 'en.rian.ru'
    category = 'news, politics, Russia'
    language = 'en'

    # Download settings.
    encoding = 'utf-8'
    oldest_article = 3
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # Mirror the metadata above into the conversion options.
    conversion_options = dict(
        comment=description,
        tags=category,
        publisher=publisher,
        language=language,
    )

    # Tag selectors: the article container to keep, the tag after which
    # content is dropped, and the elements removed from inside the article.
    keep_only_tags = [{'name': 'div', 'attrs': {'class': 'article'}}]
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'text'}}
    remove_tags = [
        {'name': ['object', 'link', 'iframe', 'base']},
        {'name': 'div', 'attrs': {'class': ['related', 'mmban', 'view-story']}},
        {'name': 'span', 'attrs': {'class': 'copyright'}},
    ]

    # (feed title, feed URL) pairs.
    feeds = [
        (u'Online news', u'http://en.rian.ru/export/rss2/archive/index.xml'),
    ]

View File

@ -0,0 +1,41 @@
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
sp.rian.ru
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Ria_eng(BasicNewsRecipe):
    """News recipe for sp.rian.ru (RIA Novosti, Spanish edition)."""

    # NOTE(review): the class name says "eng" although language is 'es';
    # it is left unchanged here so the block's public name stays the same.
    title = 'Ria Novosti'
    __author__ = 'Darko Miletic'
    # "Rusia" is the Spanish spelling of the country name (was "Russia").
    description = 'Noticias desde Rusia en Castellano'
    language = 'es'
    publisher = 'sp.rian.ru'
    category = 'news, politics, Russia'

    # Download settings.
    oldest_article = 3
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'

    # Mirror the metadata above into the conversion options.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Tag selectors: the article container to keep, the elements removed
    # from it, and the tag after which content is dropped.
    keep_only_tags = [dict(name='div', attrs={'class': 'articletxt'})]
    remove_tags = [dict(name=['object', 'link', 'iframe', 'base'])]
    remove_tags_after = dict(name='div', attrs={'class': 'text'})

    # (feed title, feed URL) pairs.
    feeds = [(u'Noticias', u'http://sp.rian.ru/export/rss2/index.xml')]

    def print_version(self, url):
        """Return *url* with every '.html' replaced by '-print.html'."""
        return url.replace('.html', '-print.html')

View File

@ -14,7 +14,6 @@ from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.pdb.ereader import image_name
from calibre.ebooks.pml import unipmlcode
from calibre import entity_to_unicode
TAG_MAP = {
'b' : 'B',
@ -158,12 +157,6 @@ class PMLMLizer(object):
text = text.replace(u'\xc2', '')
text = text.replace(u'\xa0', ' ')
# Turn all html entities into unicode. This should not be necessary as
# lxml should have already done this but we want to be sure it happens.
for entity in set(re.findall('&.+?;', text)):
mo = re.search('(%s)' % entity[1:-1], text)
text = text.replace(entity, entity_to_unicode(mo))
# Turn all characters that cannot be represented by themselves into their
# PML code equivalent
text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text)