from calibre.web.feeds.recipes import BasicNewsRecipe


class AdvancedUserRecipe1278063072(BasicNewsRecipe):
    """Recipe for the Toronto pages of Singtao Daily (news.singtao.ca).

    The feed list is built by ``parse_index``, which reads a fixed set of
    section index pages and collects the article links found on each one.
    """

    title = u'Singtao Daily - Canada'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'rty'
    description = 'Toronto Canada Chinese Newspaper'
    publisher = 'news.singtao.ca'
    category = 'Chinese, News, Canada'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'zh'
    conversion_options = {'linearize_tables': True}
    masthead_url = 'http://news.singtao.ca/i/site_2009/logo.jpg'
    # The trailing backslashes are line continuations inside the string
    # literal: the rules are joined without embedded newlines.
    extra_css = '''
        @font-face {font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\
        body {text-align: justify; margin-right: 8pt; font-family: 'DroidFont', serif;}\
        h1 {font-family: 'DroidFont', serif;}\
        .articledescription {font-family: 'DroidFont', serif;}
        '''
    keep_only_tags = [
        dict(name='div', attrs={'id': ['title', 'storybody']}),
        dict(name='div', attrs={'class': 'content'}),
    ]

    def parse_index(self):
        """Build the list of feeds from the section index pages.

        Returns:
            A list of ``(section_title, articles)`` tuples, where
            ``articles`` is the list returned by ``parse_section``.
            Sections that yield no articles are omitted.
        """
        # Titles containing Chinese text are written with \u escapes in
        # u'' literals. The previous form called .decode('utf-8') on a
        # plain string literal, which raises AttributeError on Python 3
        # because str has no decode() method there.
        sections = [
            ('Editorial',
             'http://news.singtao.ca/toronto/editorial.html'),
            (u'Toronto \u57ce\u5e02/\u793e\u5340',
             'http://news.singtao.ca/toronto/city.html'),
            (u'Canada \u52a0\u570b',
             'http://news.singtao.ca/toronto/canada.html'),
            ('Entertainment',
             'http://news.singtao.ca/toronto/entertainment.html'),
            ('World',
             'http://news.singtao.ca/toronto/world.html'),
            (u'Finance \u570b\u969b\u8ca1\u7d93',
             'http://news.singtao.ca/toronto/finance.html'),
            ('Sports',
             'http://news.singtao.ca/toronto/sports.html'),
        ]
        feeds = []
        for title, url in sections:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        """Collect the article links listed on one section index page.

        Args:
            url: Address of the section index page to fetch.

        Returns:
            A list of dicts with the keys ``title``, ``url`` and
            ``description`` (always an empty string). The list is empty
            when the page has no recognised article list container.
        """
        soup = self.index_to_soup(url)
        listing = soup.find(
            attrs={'class': ['newslist paddingL10T10',
                             'newslist3 paddingL10T10']})
        if listing is None:
            # Without the container there is nothing to iterate; skip
            # this section instead of failing the whole download with an
            # AttributeError on None.
            self.log.warn('No article list found on', url)
            return []

        current_articles = []
        for li in listing.findAll('li'):
            a = li.find('a', href=True)
            if a is None:
                continue
            title = self.tag_to_string(a)
            # Kept separate from the ``url`` parameter so the section
            # address is not overwritten while iterating.
            link = a.get('href', False)
            if not link or not title:
                continue
            if link.startswith('/'):
                # Site-relative link: prefix the site root.
                link = 'http://news.singtao.ca' + link
            current_articles.append(
                {'title': title, 'url': link, 'description': ''})
        return current_articles

    def preprocess_html(self, soup):
        """Remove ``style`` and ``width`` attributes from every tag.

        Args:
            soup: Parsed article document; it is modified in place.

        Returns:
            The same ``soup`` object.
        """
        for attr_name in ('style', 'width'):
            for item in soup.findAll(attrs={attr_name: True}):
                del item[attr_name]
        return soup