import re

from calibre.web.feeds.news import BasicNewsRecipe


class CNetJapan(BasicNewsRecipe):
    """calibre news recipe for CNET Japan (news and blog RSS feeds)."""

    title = u'CNET Japan'
    oldest_article = 3
    max_articles_per_feed = 30
    __author__ = 'Hiroshi Miura'

    # (feed title, feed URL) pairs fetched by the recipe.
    feeds = [
        (u'CNet News', u'http://feed.japan.cnet.com/rss/index.rdf'),
        (u'CNet Blog', u'http://feed.japan.cnet.com/rss/blog/index.rdf'),
    ]
    language = 'ja'
    encoding = 'utf-8'
    remove_javascript = True

    # (compiled pattern, replacement callable) pairs.
    # NOTE(review): every pattern below is a bare '.*' with an empty
    # replacement, which would strip all matched text. This looks like the
    # original markup-specific patterns were lost -- confirm against the
    # upstream recipe before relying on them. They are kept verbatim here.
    preprocess_regexps = [
        (re.compile(type(u'')(r'.*'), re.DOTALL | re.IGNORECASE | re.UNICODE),
         lambda match: ''),
        (re.compile(r'.*', re.DOTALL | re.IGNORECASE),
         lambda match: ''),
        (re.compile(type(u'')(r'.*'), re.UNICODE),
         lambda match: ''),
    ]

    # Tag selectors used to trim each downloaded page.
    remove_tags_before = dict(id="contents_l")
    remove_tags = [
        {'class': "social_bkm_share"},
        {'class': "social_bkm_print"},
        {'class': "block20 clearfix"},
        dict(name="div", attrs={'id': 'bookreview'}),
        {'class': "tag_left_ttl"},
        {'class': "tag_right"},
    ]
    remove_tags_after = {'class': "block20"}

    def parse_feeds(self):
        """Return the parsed feeds with every pheedo.jp article removed.

        Delegates to ``BasicNewsRecipe.parse_feeds`` and then drops, from
        each feed's ``articles`` list, any article whose ``url`` contains
        ``pheedo.jp`` (presumably syndicated ad/redirect entries -- TODO
        confirm). An article with no URL is kept.

        Returns:
            The feed objects produced by the base class, filtered in place.
        """
        feeds = BasicNewsRecipe.parse_feeds(self)
        for curfeed in feeds:
            # Slice assignment filters the existing list object in place, so
            # any other reference to ``curfeed.articles`` sees the result.
            # The dot is escaped so only the literal host fragment matches,
            # and ``or ''`` keeps a missing URL from raising TypeError.
            curfeed.articles[:] = [
                article for article in curfeed.articles
                if not re.search(r'pheedo\.jp', article.url or '')
            ]
        return feeds