from calibre.web.feeds.recipes import BasicNewsRecipe


class AdvancedUserRecipe1278063072(BasicNewsRecipe):
    """Recipe for the Toronto edition of Singtao Daily (news.singtao.ca).

    The feed list is built by ``parse_index``, which reads a fixed set of
    section index pages and collects the article links found on each one.
    """

    title = u'Singtao Daily - Canada'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'rty'
    description = 'Toronto Canada Chinese Newspaper'
    publisher = 'news.singtao.ca'
    category = 'Chinese, News, Canada'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'zh'
    conversion_options = {'linearize_tables': True}
    masthead_url = 'http://news.singtao.ca/i/site_2009/logo.jpg'

    # The trailing backslashes join the CSS rules into one logical line
    # inside the string literal.
    extra_css = '''
        @font-face {font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\
        body {text-align: justify; margin-right: 8pt; font-family: 'DroidFont', serif;}\
        h1 {font-family: 'DroidFont', serif;}\
        .articledescription {font-family: 'DroidFont', serif;}
        '''

    # Only these containers are kept from each downloaded article page.
    keep_only_tags = [
        dict(name='div', attrs={'id': ['title', 'storybody']}),
        dict(name='div', attrs={'class': 'content'}),
    ]

    def parse_index(self):
        """Build the feed list from the section index pages.

        Returns:
            A list of ``(section_title, articles)`` tuples, where
            ``articles`` is the non-empty list produced by
            ``parse_section`` for that section. Sections that yield no
            articles are left out.
        """
        # Chinese section names are written as \u escapes in unicode
        # literals. The previous form, a byte-escaped str followed by
        # .decode('utf-8'), raises AttributeError on Python 3 because str
        # has no decode method there.
        sections = [
            (u'Editorial',
             'http://news.singtao.ca/toronto/editorial.html'),
            (u'Toronto \u57ce\u5e02/\u793e\u5340',
             'http://news.singtao.ca/toronto/city.html'),
            (u'Canada \u52a0\u570b',
             'http://news.singtao.ca/toronto/canada.html'),
            (u'Entertainment',
             'http://news.singtao.ca/toronto/entertainment.html'),
            (u'World',
             'http://news.singtao.ca/toronto/world.html'),
            (u'Finance \u570b\u969b\u8ca1\u7d93',
             'http://news.singtao.ca/toronto/finance.html'),
            (u'Sports',
             'http://news.singtao.ca/toronto/sports.html'),
        ]

        feeds = []
        for title, url in sections:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        """Collect the article links listed on one section index page.

        Args:
            url: Address of the section index page.

        Returns:
            A list of dicts with the keys ``title``, ``url`` and
            ``description`` (always an empty string), one per ``<li>``
            that contains a link with non-empty text. The list is empty
            when the page has no recognised article-list container.
        """
        soup = self.index_to_soup(url)
        container = soup.find(attrs={'class': ['newslist paddingL10T10',
                                               'newslist3 paddingL10T10']})
        if container is None:
            # Without this guard a single section page whose markup has
            # changed would raise on None and abort the whole download.
            self.log('No article list found at', url)
            return []

        current_articles = []
        for li in container.findAll('li'):
            a = li.find('a', href=True)
            if a is None:
                continue
            title = self.tag_to_string(a)
            href = a.get('href', False)
            if not href or not title:
                continue
            # Site-relative links need the host prepended.
            if href.startswith('/'):
                href = 'http://news.singtao.ca' + href
            current_articles.append(
                {'title': title, 'url': href, 'description': ''})
        return current_articles

    def preprocess_html(self, soup):
        """Remove inline ``style`` and ``width`` attributes from every tag.

        Args:
            soup: Parsed article page.

        Returns:
            The same ``soup`` object, modified in place.
        """
        for attr in ('style', 'width'):
            for item in soup.findAll(**{attr: True}):
                del item[attr]
        return soup