diff --git a/recipes/taipei.recipe b/recipes/taipei.recipe index fd16f9212b..e2cdbb4cfe 100644 --- a/recipes/taipei.recipe +++ b/recipes/taipei.recipe @@ -1,6 +1,15 @@ +import re + from calibre.web.feeds.news import BasicNewsRecipe +def classes(classes): + q = frozenset(classes.split(' ')) + return dict( + attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)} + ) + + class TN(BasicNewsRecipe): title = u'Taipei Times' language = 'en_TW' @@ -10,20 +19,50 @@ class TN(BasicNewsRecipe): use_embedded_content = False no_stylesheets = True - auto_cleanup = True - auto_cleanup_keep = '//*[@class="main_ipic"]' + + keep_only_tags = [ + dict(name='h1'), + dict(name='h3', attrs={'class': 'a'}), + classes('main_ipic reporter text page'), + ] feeds = [ - ('Editorials', - 'http://www.taipeitimes.com/xml/editorials.rss'), - ('Taiwan', - 'http://www.taipeitimes.com/xml/taiwan.rss'), - ('Features', - 'http://www.taipeitimes.com/xml/feat.rss'), - ('Business', - 'http://www.taipeitimes.com/xml/biz.rss'), - ('World', - 'http://www.taipeitimes.com/xml/world.rss'), - ('Sports', - 'http://www.taipeitimes.com/xml/sport.rss'), + ('Editorials', 'http://www.taipeitimes.com/xml/editorials.rss'), + ('Taiwan', 'http://www.taipeitimes.com/xml/taiwan.rss'), + ('Features', 'http://www.taipeitimes.com/xml/feat.rss'), + ('Business', 'http://www.taipeitimes.com/xml/biz.rss'), + ('World', 'http://www.taipeitimes.com/xml/world.rss'), + ('Sports', 'http://www.taipeitimes.com/xml/sport.rss'), ] + + def preprocess_html(self, soup, *a): + for div in soup.findAll(**classes('page')): + for a in div.findAll('a', href=True): + a['data-calibre-follow-link'] = '1' + if a['href'].startswith('/'): + a['href'] = 'http://www.taipeitimes.com' + a['href'] + return soup + + recursions = 1 + + def is_link_wanted(self, url, tag): + digit = re.search(r'/(\d+)$', url) + if digit is not None and tag['data-calibre-follow-link'] == '1' and re.match(r'\d+', self.tag_to_string(tag)) is not None: + if int(digit.group(1)) > 1: + return True + return False + + def postprocess_html(self, soup, *a): + for div in soup.findAll(**classes('page')): + div.extract() + return soup + + # def parse_index(self): + # return [( + # 'Articles', [{ + # 'title': + # 'test', + # 'url': + # 'http://www.taipeitimes.com/News/editorials/archives/2019/02/26/2003710411' + # }] + # )]