# calibre/recipes/singtao_daily.recipe

from calibre.web.feeds.recipes import BasicNewsRecipe


class AdvancedUserRecipe1278063072(BasicNewsRecipe):
    """Recipe for the Toronto edition of Singtao Daily (news.singtao.ca).

    The article index is built by scraping a fixed list of section pages
    (see ``parse_index``) instead of relying on RSS feeds.
    """

    title = u'Singtao Daily - Canada'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'rty'
    description = 'Toronto Canada Chinese Newspaper'
    publisher = 'news.singtao.ca'
    category = 'Chinese, News, Canada'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'zh'
    conversion_options = {'linearize_tables': True}
    masthead_url = 'http://news.singtao.ca/i/site_2009/logo.jpg'
    extra_css = '''
        @font-face {font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\

    body {text-align: justify; margin-right: 8pt; font-family: 'DroidFont', serif;}\

                    h1 {font-family: 'DroidFont', serif;}\

                    .articledescription {font-family: 'DroidFont', serif;}
            '''
    # Only the headline block and the story body/content containers are kept.
    keep_only_tags = [
        dict(name='div', attrs={'id': ['title', 'storybody']}),
        dict(name='div', attrs={'class': 'content'})
    ]

    def parse_index(self):
        """Build the feed list from the hard-coded section pages.

        Returns:
            A list of ``(section_title, articles)`` tuples, where ``articles``
            is the list produced by ``parse_section``. Sections that yield no
            articles are omitted.
        """
        # Titles use unicode escapes rather than UTF-8 byte strings followed
        # by ``.decode('utf-8')``: ``str`` has no ``decode`` on Python 3, and
        # ``u'...'`` literals behave the same on Python 2 and 3.
        sections = [
            ('Editorial',
             'http://news.singtao.ca/toronto/editorial.html'),
            # "Toronto   City/Community"
            (u'Toronto   \u57ce\u5e02/\u793e\u5340',
             'http://news.singtao.ca/toronto/city.html'),
            # "Canada Canada"
            (u'Canada \u52a0\u570b',
             'http://news.singtao.ca/toronto/canada.html'),
            ('Entertainment',
             'http://news.singtao.ca/toronto/entertainment.html'),
            ('World',
             'http://news.singtao.ca/toronto/world.html'),
            # "Finance International finance"
            (u'Finance \u570b\u969b\u8ca1\u7d93',
             'http://news.singtao.ca/toronto/finance.html'),
            ('Sports', 'http://news.singtao.ca/toronto/sports.html'),
        ]

        feeds = []
        for title, url in sections:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        """Collect the article links listed on one section page.

        Args:
            url: Address of the section index page to fetch.

        Returns:
            A list of dicts with the keys ``title``, ``url`` and
            ``description`` (always an empty string). The list is empty when
            the page has no recognised news-list container or no usable links.
        """
        soup = self.index_to_soup(url)
        div = soup.find(
            attrs={'class': ['newslist paddingL10T10', 'newslist3 paddingL10T10']})
        if div is None:
            # The page layout changed or the section is empty: skip this
            # section instead of failing with AttributeError on ``None``.
            return []

        current_articles = []
        for li in div.findAll('li'):
            a = li.find('a', href=True)
            if a is None:
                continue
            title = self.tag_to_string(a)
            href = a.get('href', False)
            if not href or not title:
                continue
            # Site-relative links are made absolute against the news host.
            if href.startswith('/'):
                href = 'http://news.singtao.ca' + href
            current_articles.append(
                {'title': title, 'url': href, 'description': ''})

        return current_articles

    def preprocess_html(self, soup):
        """Remove inline ``style`` and ``width`` attributes from every tag.

        Args:
            soup: Parsed article document; it is modified in place.

        Returns:
            The same ``soup`` object after the attributes were removed.
        """
        for attr in ('style', 'width'):
            for item in soup.findAll(**{attr: True}):
                del item[attr]
        return soup