Update Taipei Times

2025-08-30 23:00:21 -04:00 · 2019-02-26 07:44:14 +05:30 · 2019-02-26 07:44:14 +05:30 · 7640a27b13
commit 7640a27b13
parent fe291a5958
1 changed files with 53 additions and 14 deletions
--- a/recipes/taipei.recipe
+++ b/recipes/taipei.recipe
@@ -1,6 +1,15 @@
+import re
+
 from calibre.web.feeds.news import BasicNewsRecipe


+def classes(classes):  # Build an `attrs` matcher dict for any of the space-separated class names in `classes`.
+    q = frozenset(classes.split(' '))  # wanted class names, split on single spaces
+    return dict(
+        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}  # truthy only when x is non-empty and shares a name with q
+    )
+
+
 class TN(BasicNewsRecipe):
    title = u'Taipei Times'
    language = 'en_TW'
@@ -10,20 +19,50 @@ class TN(BasicNewsRecipe):
    use_embedded_content = False

    no_stylesheets = True
-    auto_cleanup = True
-    auto_cleanup_keep = '//*[@class="main_ipic"]'
+
+    keep_only_tags = [  # presumably the base recipe keeps only elements matching these selectors — confirm
+        dict(name='h1'),
+        dict(name='h3', attrs={'class': 'a'}),
+        classes('main_ipic reporter text page'),  # any element carrying at least one of these four class names
+    ]

    feeds = [
-        ('Editorials',
-         'http://www.taipeitimes.com/xml/editorials.rss'),
-        ('Taiwan',
-         'http://www.taipeitimes.com/xml/taiwan.rss'),
-        ('Features',
-         'http://www.taipeitimes.com/xml/feat.rss'),
-        ('Business',
-         'http://www.taipeitimes.com/xml/biz.rss'),
-        ('World',
-         'http://www.taipeitimes.com/xml/world.rss'),
-        ('Sports',
-         'http://www.taipeitimes.com/xml/sport.rss'),
+        ('Editorials', 'http://www.taipeitimes.com/xml/editorials.rss'),
+        ('Taiwan', 'http://www.taipeitimes.com/xml/taiwan.rss'),
+        ('Features', 'http://www.taipeitimes.com/xml/feat.rss'),
+        ('Business', 'http://www.taipeitimes.com/xml/biz.rss'),
+        ('World', 'http://www.taipeitimes.com/xml/world.rss'),
+        ('Sports', 'http://www.taipeitimes.com/xml/sport.rss'),
    ]
+
+    def preprocess_html(self, soup, *a):  # Mark links inside 'page' elements for following and make their hrefs absolute; returns soup.
+        for div in soup.findAll(**classes('page')):  # every element whose class list includes 'page'
+            for a in div.findAll('a', href=True):  # NOTE: rebinds `a`, shadowing the otherwise unused *a parameter
+                a['data-calibre-follow-link'] = '1'  # marker attribute that is_link_wanted checks for
+                if a['href'].startswith('/'):
+                    a['href'] = 'http://www.taipeitimes.com' + a['href']  # site-relative href -> absolute URL
+        return soup
+
+    recursions = 1  # presumably the link-follow depth used by the base recipe — confirm
+
+    def is_link_wanted(self, url, tag):  # True only for a marked link whose tag_to_string text starts with digits and whose URL ends in /<n> with n > 1.
+        digit = re.search(r'/(\d+)$', url)  # trailing all-digit path segment, if any
+        if digit is not None and tag['data-calibre-follow-link'] == '1' and re.match(r'\d+', self.tag_to_string(tag)) is not None:  # NOTE(review): tag[...] presumably raises KeyError when the attribute is absent — confirm the caller tolerates that
+            if int(digit.group(1)) > 1:  # presumably /1 is the page already being fetched — confirm
+                return True
+        return False
+
+    def postprocess_html(self, soup, *a):  # Call extract() on every 'page' element and return soup; *a is unused.
+        for div in soup.findAll(**classes('page')):  # every element whose class list includes 'page'
+            div.extract()  # presumably detaches the element from the tree (BeautifulSoup-style) — confirm
+        return soup
+
+    # def parse_index(self):
+    #     return [(
+    #         'Articles', [{
+    #             'title':
+    #             'test',
+    #             'url':
+    #             'http://www.taipeitimes.com/News/editorials/archives/2019/02/26/2003710411'
+    #         }]
+    #     )]