# calibre/recipes/china_press.recipe

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1277228948(BasicNewsRecipe):
    """Recipe that builds an e-book from the China Press USA RSS feeds.

    Each article is reduced to its ``div.show`` container, inline ``style``
    attributes are stripped, and the text of any continuation page linked
    from the ``div#displaypagenum`` pager is merged into the article body.
    """

    title = u'China Press USA'
    oldest_article = 7
    max_articles_per_feed = 100

    __author__ = 'rty'
    __version__ = '1.0'
    language = 'zh'
    # Was misspelled ``pubisher`` in the original recipe.
    publisher = 'www.chinapressusa.com'
    description = 'Overseas Chinese Network Newspaper in the USA'
    category = 'News in Chinese, USA'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    # An earlier revision had 'GB2312' here (left commented out).
    encoding = 'UTF-8'
    conversion_options = {'linearize_tables': True}
    masthead_url = 'http://www.chinapressusa.com/common/images/logo.gif'

    # Prefix applied to relative pager links in append_page().
    # NOTE(review): the original code read ``self.INDEX`` without ever
    # defining it, so any article with a pager raised AttributeError. The
    # correct base URL cannot be determined from this file; set it here if
    # the site emits relative pager hrefs. Absolute hrefs work regardless.
    INDEX = ''

    extra_css = '''
             @font-face { font-family: "DroidFont", serif, sans-serif;  src: url(res:///system/fonts/DroidSansFallback.ttf); }\n
             body {
                  margin-right: 8pt;
                  font-family: 'DroidFont', serif;}
              h1  {font-family: 'DroidFont', serif, sans-serif}
            .show {font-family: 'DroidFont', serif, sans-serif}
            '''

    feeds = [
        (u'\u65b0\u95fb\u9891\u9053', u'http://news.uschinapress.com/news.xml'),
        (u'\u534e\u4eba\u9891\u9053', u'http://chinese.uschinapress.com/chinese.xml'),
        (u'\u8bc4\u8bba\u9891\u9053', u'http://review.uschinapress.com/review.xml'),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class': 'show'}),
    ]
    remove_tags = [
        # dict(name='table', attrs={'class': 'xle'}),
        dict(name='div', attrs={'class': 'time'}),
    ]
    remove_tags_after = [
        dict(name='div', attrs={'class': 'bank17'}),
        # dict(name='a', attrs={'class': 'ab12'}),
    ]

    def _strip_inline_styles(self, root):
        """Delete the ``style`` attribute from every tag below *root*."""
        for tag in root.findAll(style=True):
            del tag['style']

    def append_page(self, soup, appendtag, position):
        """Fetch the page linked from *soup*'s pager and merge its text.

        Looks for ``div#displaypagenum`` in *soup*, follows the first link
        inside it, takes the ``div.show`` element of the fetched page,
        recursively merges that page's own continuation into it, and then
        inserts it into *appendtag* at index *position*.

        Does nothing when there is no pager, the pager has no usable link,
        the link cannot be resolved to an absolute URL, or the fetched page
        has no ``div.show`` element.
        """
        pager = soup.find('div', attrs={'id': 'displaypagenum'})
        link = pager.a if pager is not None else None
        href = link.get('href') if link is not None else None
        if not href:
            return

        absolute_prefixes = ('http://', 'https://')
        if href.startswith(absolute_prefixes):
            nexturl = href
        else:
            nexturl = self.INDEX + href
        if not nexturl.startswith(absolute_prefixes):
            # Without a base URL a relative link cannot be fetched; skip the
            # continuation instead of failing the whole article.
            self.log.warn('Cannot resolve pager link %r; set INDEX' % href)
            return

        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'class': 'show'})
        if texttag is None:
            return

        self._strip_inline_styles(texttag)
        # Pull in any further pages at the end of this page's text before
        # moving the text into the parent document.
        self.append_page(soup2, texttag, len(texttag.contents))
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Prepare a downloaded article and return the modified *soup*.

        Inserts language/charset meta markup at the start of ``<head>``,
        removes inline styles, merges continuation pages into ``<body>``
        and finally removes the pager element itself.
        """
        # NOTE(review): this inserts a plain string rather than parsed tags;
        # how it is serialised depends on the BeautifulSoup version in use.
        mtag = '<meta http-equiv="Content-Language" content="zh-CN"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
        if soup.head is not None:
            soup.head.insert(0, mtag)

        self._strip_inline_styles(soup)

        if soup.body is not None:
            self.append_page(soup, soup.body, 3)

        pager = soup.find('div', attrs={'id': 'displaypagenum'})
        if pager is not None:
            pager.extract()
        return soup