import re

from calibre.web.feeds.news import BasicNewsRecipe


class CNetJapan(BasicNewsRecipe):
    """calibre news recipe for the CNET Japan news and blog RSS feeds.

    The class configures the feed list and HTML clean-up rules through
    ``BasicNewsRecipe`` class attributes and overrides ``parse_feeds`` to
    drop articles whose URL points at pheedo.jp.  The exact meaning of each
    attribute is defined by calibre's ``BasicNewsRecipe`` API.
    """

    title = u'CNET Japan'
    oldest_article = 3
    max_articles_per_feed = 30
    __author__ = 'Hiroshi Miura'

    feeds = [
        (u'CNet News', u'http://feed.japan.cnet.com/rss/index.rdf'),
        (u'CNet Blog', u'http://feed.japan.cnet.com/rss/blog/index.rdf'),
    ]
    language = 'ja'
    encoding = 'utf-8'
    remove_javascript = True

    # (compiled pattern, replacement callable) pairs.
    # Plain raw strings are used instead of the Python-2-only ``ur''``
    # prefix, which is a syntax error on Python 3; the patterns are pure
    # ASCII, so the compiled regexes are unchanged.
    # NOTE(review): every pattern is a bare ``.*`` with an empty replacement;
    # with re.DOTALL that would match an entire page.  This looks like the
    # original start/end marker text was lost -- confirm against the
    # upstream recipe before relying on these rules.
    preprocess_regexps = [
        (re.compile(r'.*', re.DOTALL | re.IGNORECASE | re.UNICODE),
         lambda match: ''),
        (re.compile(r'.*', re.DOTALL | re.IGNORECASE),
         lambda match: ''),
        (re.compile(r'.*', re.UNICODE),
         lambda match: ''),
    ]

    remove_tags_before = dict(id="contents_l")
    remove_tags = [
        {'class': "social_bkm_share"},
        {'class': "social_bkm_print"},
        {'class': "block20 clearfix"},
        dict(name="div", attrs={'id': 'bookreview'}),
        {'class': "tag_left_ttl"},
        {'class': "tag_right"},
    ]
    remove_tags_after = {'class': "block20"}

    def parse_feeds(self):
        """Return the parsed feeds with pheedo.jp articles removed.

        Delegates to ``BasicNewsRecipe.parse_feeds`` and then filters each
        feed's ``articles`` list, discarding every article whose ``url``
        contains ``pheedo.jp`` (presumably ad/redirect entries -- confirm).

        Returns:
            The feed objects produced by the base class, with their
            ``articles`` lists filtered in place.
        """
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            # Slice-assign so the existing list object is updated in place,
            # in a single pass.  A literal substring test is used because
            # the host name contains a dot, which a regex would treat as
            # "any character" unless escaped.
            feed.articles[:] = [
                article for article in feed.articles
                if 'pheedo.jp' not in article.url
            ]
        return feeds