From cee89baa64b72918c6b0281ea2399433aea63e1b Mon Sep 17 00:00:00 2001
From: Hiroshi Miura
Date: Sun, 21 Nov 2010 22:28:20 +0900
Subject: [PATCH] add: recipe - add MSN Sankei News product releases - fix reuters_ja recipe to remove several tags

---
Review notes (text in this section is ignored when the patch is applied):
 - Line structure was restored from a whitespace-collapsed copy of this
   patch. Line boundaries follow the +/- markers and agree with the
   original hunk counts; the exact indentation width was not recoverable,
   so applying may need `git apply --ignore-whitespace`.
 - ReutersJa.print_version now returns the URL unchanged when it has no
   idJPJAPAN-<digits> part instead of raising AttributeError on a failed
   match. The second reuters_ja hunk header and the diffstat were updated
   for the added lines; the blob id on the reuters_ja `index` line still
   refers to the original revision.

 resources/recipes/msnsankei.recipe  | 22 ++++++++++++++++++++++
 resources/recipes/reuters_ja.recipe | 21 +++++++++++++++++++--
 2 files changed, 41 insertions(+), 2 deletions(-)
 create mode 100644 resources/recipes/msnsankei.recipe

diff --git a/resources/recipes/msnsankei.recipe b/resources/recipes/msnsankei.recipe
new file mode 100644
index 0000000000..61ba0de6dc
--- /dev/null
+++ b/resources/recipes/msnsankei.recipe
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Hiroshi Miura '
+'''
+sankei.jp.msn.com
+'''
+
+class MSNSankeiNewsProduct(BasicNewsRecipe):
+    title = u'MSN\u7523\u7d4c\u30cb\u30e5\u30fc\u30b9(\u65b0\u5546\u54c1)'
+    __author__ = 'Hiroshi Miura'
+    description = 'Products release from Japan'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    encoding = 'Shift_JIS'
+    language = 'ja'
+
+    feeds = [(u'\u65b0\u5546\u54c1', u'http://sankei.jp.msn.com/rss/news/release.xml')]
+
+    remove_tags_before = dict(id="__r_article_title__")
+    remove_tags_after = dict(id="ajax_release_news")
+    remove_tags = [{'class':"parent chromeCustom6G"}]
diff --git a/resources/recipes/reuters_ja.recipe b/resources/recipes/reuters_ja.recipe
index d926c29096..ffa084bc88 100644
--- a/resources/recipes/reuters_ja.recipe
+++ b/resources/recipes/reuters_ja.recipe
@@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+import re
 
 class ReutersJa(BasicNewsRecipe):
 
@@ -20,6 +21,22 @@ class ReutersJa(BasicNewsRecipe):
     remove_tags_before = {'class':"article primaryContent"}
     remove_tags = [ dict(id="banner"),
                     dict(id="autilities"),
-                    dict(id="textSizer")
+                    dict(id="textSizer"),
+                    dict(id="shareFooter"),
+                    dict(id="relatedNews"),
+                    dict(id="editorsChoice"),
+                    dict(id="ecArticles"),
+                    {'class':"secondaryContent"},
+                    {'class':"module"},
                   ]
-    remove_tags_after = dict(id="copyrightNotice")
+    remove_tags_after = {'class':"assetBuddy"}
+
+    def print_version(self, url):
+        # Append '?sp=true' to the URL up to its idJPJAPAN-<digits> part.
+        # re.search returns None when that part is absent; return the URL
+        # unchanged in that case rather than failing on m.group().
+        m = re.search('(.*idJPJAPAN-[0-9]+)', url)
+        if m is None:
+            return url
+        return m.group(0)+'?sp=true'
+