From f962231a7c3be8a7be9163c3e5cacdc20f2dc68c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 5 Dec 2009 20:15:43 -0700 Subject: [PATCH] Implement #4141 (RIA Novosti in english and spanish) --- resources/recipes/rian_eng.recipe | 42 +++++++++++++++++++++++++++++++ resources/recipes/rian_spa.recipe | 41 ++++++++++++++++++++++++++++++ src/calibre/ebooks/pml/pmlml.py | 7 ------ 3 files changed, 83 insertions(+), 7 deletions(-) create mode 100644 resources/recipes/rian_eng.recipe create mode 100644 resources/recipes/rian_spa.recipe diff --git a/resources/recipes/rian_eng.recipe b/resources/recipes/rian_eng.recipe new file mode 100644 index 0000000000..172a50beda --- /dev/null +++ b/resources/recipes/rian_eng.recipe @@ -0,0 +1,42 @@ + +__license__ = 'GPL v3' +__copyright__ = '2009, Darko Miletic ' +''' +en.rian.ru +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Ria_eng(BasicNewsRecipe): + title = 'Ria Novosti' + __author__ = 'Darko Miletic' + description = 'News from Russia in English' + language = 'en' + publisher = 'en.rian.ru' + category = 'news, politics, Russia' + oldest_article = 3 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + encoding = 'utf-8' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + + keep_only_tags = [dict(name='div', attrs={'class':'article'})] + + remove_tags = [ + dict(name=['object','link','iframe','base']) + ,dict(name='div',attrs={'class':['related','mmban','view-story']}) + ,dict(name='span',attrs={'class':'copyright'}) + ] + remove_tags_after = dict(name='div',attrs={'class':'text'}) + + + feeds = [(u'Online news', u'http://en.rian.ru/export/rss2/archive/index.xml')] + diff --git a/resources/recipes/rian_spa.recipe b/resources/recipes/rian_spa.recipe new file mode 100644 index 0000000000..5d2115168b --- /dev/null +++ b/resources/recipes/rian_spa.recipe @@ -0,0 +1,41 @@ + +__license__ = 'GPL v3' +__copyright__ = '2009, Darko Miletic ' +''' +sp.rian.ru +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Ria_eng(BasicNewsRecipe): + title = 'Ria Novosti' + __author__ = 'Darko Miletic' + description = 'Noticias desde Russia en Castellano' + language = 'es' + publisher = 'sp.rian.ru' + category = 'news, politics, Russia' + oldest_article = 3 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + encoding = 'utf-8' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + + keep_only_tags = [dict(name='div', attrs={'class':'articletxt'})] + remove_tags = [dict(name=['object','link','iframe','base'])] + remove_tags_after = dict(name='div',attrs={'class':'text'}) + + + feeds = [(u'Noticias', u'http://sp.rian.ru/export/rss2/index.xml')] + + def print_version(self, url): + return url.replace('.html','-print.html') + + diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index b40870c0b5..b23cd40813 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -14,7 +14,6 @@ from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.pdb.ereader import image_name from calibre.ebooks.pml import unipmlcode -from calibre import entity_to_unicode TAG_MAP = { 'b' : 'B', @@ -158,12 +157,6 @@ class PMLMLizer(object): text = text.replace(u'\xc2', '') text = text.replace(u'\xa0', ' ') - # Turn all html entities into unicode. This should not be necessary as - # lxml should have already done this but we want to be sure it happens. - for entity in set(re.findall('&.+?;', text)): - mo = re.search('(%s)' % entity[1:-1], text) - text = text.replace(entity, entity_to_unicode(mo)) - # Turn all characters that cannot be represented by themself into their # PML code equivelent text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text)