import re

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build ``findAll`` keyword arguments that match on CSS class names.

    ``classes`` is a space-separated string of class names. The returned dict
    carries an ``attrs`` filter whose ``class`` predicate is truthy when the
    value it is given shares at least one whitespace-separated name with
    that string (and falsy for an empty/missing class value).
    """
    q = frozenset(classes.split(' '))
    return dict(
        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
    )


class TN(BasicNewsRecipe):
    """Recipe for the Taipei Times RSS feeds.

    Anchors inside ``page``-classed divs are marked in ``preprocess_html``;
    ``is_link_wanted`` then accepts only those marked links that look like
    numbered pages after the first one.
    """

    title = u'Taipei Times'
    language = 'en_TW'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    use_embedded_content = False
    no_stylesheets = True

    keep_only_tags = [
        dict(name='h1'),
        dict(name='h3', attrs={'class': 'a'}),
        classes('main_ipic reporter text page'),
    ]

    feeds = [
        ('Front Page', 'http://www.taipeitimes.com/xml/front.rss'),
        ('Editorials', 'http://www.taipeitimes.com/xml/editorials.rss'),
        ('Taiwan', 'http://www.taipeitimes.com/xml/taiwan.rss'),
        ('Features', 'http://www.taipeitimes.com/xml/feat.rss'),
        ('Business', 'http://www.taipeitimes.com/xml/biz.rss'),
        ('World', 'http://www.taipeitimes.com/xml/world.rss'),
        ('Sports', 'http://www.taipeitimes.com/xml/sport.rss'),
    ]

    def preprocess_html(self, soup, *a):
        """Mark every anchor inside a ``page`` div and absolutize its href.

        Each such anchor gets ``data-calibre-follow-link="1"`` so that
        ``is_link_wanted`` can recognise it later; hrefs starting with ``/``
        are prefixed with the site origin. Returns the same ``soup``.
        """
        for div in soup.findAll(**classes('page')):
            # Loop variable deliberately not named ``a`` so it does not
            # shadow the ``*a`` varargs parameter.
            for link in div.findAll('a', href=True):
                link['data-calibre-follow-link'] = '1'
                if link['href'].startswith('/'):
                    link['href'] = 'http://www.taipeitimes.com' + link['href']
        return soup

    recursions = 1

    def is_link_wanted(self, url, tag):
        """Return True only for marked links pointing at page number > 1.

        A link is wanted when all of the following hold:
        * ``url`` ends in ``/<digits>``,
        * ``tag`` carries ``data-calibre-follow-link="1"`` (set in
          ``preprocess_html``),
        * the link text starts with a digit,
        * the trailing number in ``url`` is greater than 1.
        """
        digit = re.search(r'/(\d+)$', url)
        if digit is None:
            return False
        # ``.get`` instead of subscripting: anchors that were never marked by
        # preprocess_html have no such attribute and must simply be rejected
        # rather than raising KeyError.
        if tag.get('data-calibre-follow-link') != '1':
            return False
        if re.match(r'\d+', self.tag_to_string(tag)) is None:
            return False
        return int(digit.group(1)) > 1

    def postprocess_html(self, soup, *a):
        """Remove every ``page``-classed div from ``soup`` and return it."""
        for div in soup.findAll(**classes('page')):
            div.extract()
        return soup

    # NOTE(review): commented-out single-article index, apparently kept for
    # manual testing of one URL -- confirm before removing.
    # def parse_index(self):
    #     return [(
    #         'Articles', [{
    #             'title':
    #             'test',
    #             'url':
    #             'http://www.taipeitimes.com/News/editorials/archives/2019/02/26/2003710411'
    #         }]
    #     )]