Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
recipes: add new CNET Japan feeds
- remove some unwanted tags
- skip ad pages

commit c0c1f8225f (parent 064bfaa7f9)
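
For orientation: the "skip ad pages" part of this change overrides parse_feeds() and drops feed entries whose URLs go through pheedo.jp, the feed-advertising redirector that CNET Japan's RSS links pass through. A minimal standalone sketch of the same idea in a more compact form (the class name and the reuse of the news feed URL are illustrative only; the actual recipes appear in the diff below):

    import re
    from calibre.web.feeds.news import BasicNewsRecipe

    class CNetJapanSketch(BasicNewsRecipe):
        # Hypothetical minimal recipe; the feed URL is taken from the commit below.
        title = u'CNET Japan (sketch)'
        feeds = [(u'CNet News', u'http://feed.japan.cnet.com/rss/index.rdf')]

        def parse_feeds(self):
            # Parse the feeds as usual, then drop every article whose URL
            # points at the pheedo.jp ad redirector instead of a real page.
            feeds = BasicNewsRecipe.parse_feeds(self)
            for curfeed in feeds:
                curfeed.articles[:] = [a for a in curfeed.articles
                                       if not re.search(r'pheedo\.jp', a.url)]
            return feeds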
Modified recipe (class CNetJapan):

@@ -7,7 +7,9 @@ class CNetJapan(BasicNewsRecipe):
     max_articles_per_feed = 30
     __author__ = 'Hiroshi Miura'
 
-    feeds = [(u'cnet rss', u'http://feeds.japan.cnet.com/cnet/rss')]
+    feeds = [(u'CNet News', u'http://feed.japan.cnet.com/rss/index.rdf'),
+             (u'CNet Blog', u'http://feed.japan.cnet.com/rss/blog/index.rdf')
+            ]
     language = 'ja'
     encoding = 'Shift_JIS'
     remove_javascript = True
@@ -21,12 +23,29 @@ class CNetJapan(BasicNewsRecipe):
          lambda match: '<!-- removed -->'),
     ]
 
-    remove_tags_before = dict(name="h2")
+    remove_tags_before = dict(id="contents_l")
     remove_tags = [
          {'class':"social_bkm_share"},
          {'class':"social_bkm_print"},
          {'class':"block20 clearfix"},
          dict(name="div",attrs={'id':'bookreview'}),
+         {'class':"tag_left_ttl"},
+         {'class':"tag_right"}
     ]
     remove_tags_after = {'class':"block20"}
 
+    def parse_feeds(self):
+
+        feeds = BasicNewsRecipe.parse_feeds(self)
+
+        for curfeed in feeds:
+            delList = []
+            for a,curarticle in enumerate(curfeed.articles):
+                if re.search(r'pheedo.jp', curarticle.url):
+                    delList.append(curarticle)
+            if len(delList)>0:
+                for d in delList:
+                    index = curfeed.articles.index(d)
+                    curfeed.articles[index:index+1] = []
+
+        return feeds

resources/recipes/cnetjapan_digital.recipe (new file, 49 lines)

@@ -0,0 +1,49 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class CNetJapanDigital(BasicNewsRecipe):
+    title = u'CNET Japan Digital'
+    oldest_article = 3
+    max_articles_per_feed = 30
+    __author__ = 'Hiroshi Miura'
+
+    feeds = [(u'CNet digital',u'http://feed.japan.cnet.com/rss/digital/index.rdf') ]
+    language = 'ja'
+    encoding = 'Shift_JIS'
+    remove_javascript = True
+
+    preprocess_regexps = [
+        (re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>', re.DOTALL|re.IGNORECASE|re.UNICODE),
+         lambda match: '</body>'),
+        (re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL|re.IGNORECASE),
+         lambda match: '</body>'),
+        (re.compile(ur'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->', re.UNICODE),
+         lambda match: '<!-- removed -->'),
+    ]
+
+    remove_tags_before = dict(id="contents_l")
+    remove_tags = [
+         {'class':"social_bkm_share"},
+         {'class':"social_bkm_print"},
+         {'class':"block20 clearfix"},
+         dict(name="div",attrs={'id':'bookreview'}),
+         {'class':"tag_left_ttl"},
+         {'class':"tag_right"}
+    ]
+    remove_tags_after = {'class':"block20"}
+
+    def parse_feeds(self):
+
+        feeds = BasicNewsRecipe.parse_feeds(self)
+
+        for curfeed in feeds:
+            delList = []
+            for a,curarticle in enumerate(curfeed.articles):
+                if re.search(r'pheedo.jp', curarticle.url):
+                    delList.append(curarticle)
+            if len(delList)>0:
+                for d in delList:
+                    index = curfeed.articles.index(d)
+                    curfeed.articles[index:index+1] = []
+
+        return feeds

resources/recipes/cnetjapan_release.recipe (new file, 48 lines)

@@ -0,0 +1,48 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class CNetJapanRelease(BasicNewsRecipe):
+    title = u'CNET Japan release'
+    oldest_article = 3
+    max_articles_per_feed = 30
+    __author__ = 'Hiroshi Miura'
+
+    feeds = [(u'CNet Release', u'http://feed.japan.cnet.com/rss/release/index.rdf') ]
+    language = 'ja'
+    encoding = 'Shift_JIS'
+    remove_javascript = True
+
+    preprocess_regexps = [
+        (re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>', re.DOTALL|re.IGNORECASE|re.UNICODE),
+         lambda match: '</body>'),
+        (re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL|re.IGNORECASE),
+         lambda match: '</body>'),
+        (re.compile(ur'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->', re.UNICODE),
+         lambda match: '<!-- removed -->'),
+    ]
+
+    remove_tags_before = dict(id="contents_l")
+    remove_tags = [
+         {'class':"social_bkm_share"},
+         {'class':"social_bkm_print"},
+         {'class':"block20 clearfix"},
+         dict(name="div",attrs={'id':'bookreview'}),
+         {'class':"tag_left_ttl"}
+    ]
+    remove_tags_after = {'class':"block20"}
+
+    def parse_feeds(self):
+
+        feeds = BasicNewsRecipe.parse_feeds(self)
+
+        for curfeed in feeds:
+            delList = []
+            for a,curarticle in enumerate(curfeed.articles):
+                if re.search(r'pheedo.jp', curarticle.url):
+                    delList.append(curarticle)
+            if len(delList)>0:
+                for d in delList:
+                    index = curfeed.articles.index(d)
+                    curfeed.articles[index:index+1] = []
+
+        return feeds
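
A note on the preprocess_regexps shared by these recipes: the \u25B2 escapes are the '▲' marker character and \u95A2\u9023\u30BF\u30B0 is 関連タグ ("related tags"), so the patterns cut CNET Japan's comment-delimited ad and related-tag blocks out of the page before conversion. A small demo of how the first pattern behaves; the sample HTML string is made up for illustration, and the ur'' syntax matches the Python 2 era of these recipes:

    import re

    # Hypothetical page fragment using the "▲contents_left END▲" comment marker.
    sample = u'<p>article body</p>\n<!--\u25B2contents_left END\u25B2-->\n<div>ad block</div>\n</body>'

    # Same pattern as in the recipes: drop everything from the marker through
    # </body>, leaving just a closing </body> after the article content.
    pattern = re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>',
                         re.DOTALL | re.IGNORECASE | re.UNICODE)

    print(pattern.sub('</body>', sample))
    # -> <p>article body</p>
    #    </body>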