calibre/recipes/taipei.recipe
Kovid Goyal a629e3bf1a
...
2019-02-26 15:12:07 +05:30

70 lines
2.1 KiB
Plaintext

import re
from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
    """Build BeautifulSoup search kwargs matching any of the given CSS classes.

    ``classes`` is a whitespace-separated string of class names.  The
    returned dict has a single ``attrs`` key whose ``class`` entry is a
    predicate; the predicate is truthy when the element's class attribute
    value shares at least one name with ``classes``.
    """
    # split() without an argument ignores repeated whitespace, so no empty
    # token ends up in the wanted set.
    wanted = frozenset(classes.split())

    def has_wanted_class(value):
        # A missing/empty class attribute is returned as-is (falsy);
        # otherwise the (possibly empty) set of shared names is returned.
        return value and frozenset(value.split()).intersection(wanted)

    return dict(attrs={'class': has_wanted_class})
class TN(BasicNewsRecipe):
    """Recipe that downloads the Taipei Times from its RSS feeds.

    Elements with the ``page`` class (presumably the article pager --
    confirm against the site markup) are used to find follow-up pages of an
    article and are removed from the final output.
    """

    title = u'Taipei Times'
    language = 'en_TW'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    use_embedded_content = False
    no_stylesheets = True
    # Follow links one level deep; is_link_wanted() decides which ones.
    recursions = 1

    keep_only_tags = [
        dict(name='h1'),
        dict(name='h3', attrs={'class': 'a'}),
        classes('main_ipic reporter text page'),
    ]

    feeds = [
        ('Front Page', 'http://www.taipeitimes.com/xml/front.rss'),
        ('Editorials', 'http://www.taipeitimes.com/xml/editorials.rss'),
        ('Taiwan', 'http://www.taipeitimes.com/xml/taiwan.rss'),
        ('Features', 'http://www.taipeitimes.com/xml/feat.rss'),
        ('Business', 'http://www.taipeitimes.com/xml/biz.rss'),
        ('World', 'http://www.taipeitimes.com/xml/world.rss'),
        ('Sports', 'http://www.taipeitimes.com/xml/sport.rss'),
    ]

    def preprocess_html(self, soup, *a):
        """Mark links inside ``page`` elements and make their URLs absolute.

        Every ``<a href>`` inside an element with the ``page`` class gets a
        ``data-calibre-follow-link="1"`` attribute, which is_link_wanted()
        checks later.  Site-relative hrefs (starting with ``/``) are
        prefixed with the site origin.  Returns the modified soup.
        """
        for pager in soup.findAll(**classes('page')):
            for link in pager.findAll('a', href=True):
                link['data-calibre-follow-link'] = '1'
                if link['href'].startswith('/'):
                    link['href'] = 'http://www.taipeitimes.com' + link['href']
        return soup

    def is_link_wanted(self, url, tag):
        """Return True only for marked links that point at page 2 or later.

        A link is wanted when all of the following hold:

        * ``tag`` carries the marker set by preprocess_html();
        * ``url`` ends in ``/<digits>`` and that number is greater than 1;
        * the link text starts with a digit.
        """
        # Use .get(): links that preprocess_html() did not mark have no such
        # attribute, and indexing would raise KeyError for them.
        if tag.get('data-calibre-follow-link') != '1':
            return False
        page_number = re.search(r'/(\d+)$', url)
        if page_number is None:
            return False
        if re.match(r'\d+', self.tag_to_string(tag)) is None:
            return False
        return int(page_number.group(1)) > 1

    def postprocess_html(self, soup, *a):
        """Remove every element with the ``page`` class and return the soup."""
        for pager in soup.findAll(**classes('page')):
            pager.extract()
        return soup
# def parse_index(self):
# return [(
# 'Articles', [{
# 'title':
# 'test',
# 'url':
# 'http://www.taipeitimes.com/News/editorials/archives/2019/02/26/2003710411'
# }]
# )]