diff --git a/resources/recipes/london_free_press.recipe b/resources/recipes/london_free_press.recipe
new file mode 100644
index 0000000000..82f6a2abc9
--- /dev/null
+++ b/resources/recipes/london_free_press.recipe
@@ -0,0 +1,38 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class LondonFreePress(BasicNewsRecipe):
+    title = u'London Free Press'
+    __author__ = 'rty'
+    oldest_article = 4
+    max_articles_per_feed = 100
+
+    publisher = 'lfpress.com'
+    description = 'Ontario Canada Newspaper'
+    category = 'News, Ontario, Canada'
+    remove_javascript = True
+    use_embedded_content = False
+    no_stylesheets = True
+    language = 'en_CA'
+    encoding = 'utf-8'
+    conversion_options = {'linearize_tables':True}
+
+    feeds = [
+        (u'News', u'http://www.lfpress.com/news/rss.xml'),
+        (u'Comment', u'http://www.lfpress.com/comment/rss.xml'),
+        (u'Entertainment', u'http://www.lfpress.com/entertainment/rss.xml'),
+        (u'Money', u'http://www.lfpress.com/money/rss.xml'),
+        (u'Life', u'http://www.lfpress.com/life/rss.xml'),
+        (u'Sports', u'http://www.lfpress.com/sports/rss.xml')
+    ]
+
+    keep_only_tags = [
+        dict(name='div', attrs={'id':'article'}),
+    ]
+    remove_tags = [
+        dict(name='div', attrs={'id':'commentsBottom'}),
+        dict(name='div', attrs={'class':['leftBox','bottomBox clear']}),
+        dict(name='ul', attrs={'class':'tabs dl contentSwap'}),
+    ]
+    remove_tags_after = [
+        dict(name='div', attrs={'class':'bottomBox clear'}),
+    ]
diff --git a/resources/recipes/people_daily.recipe b/resources/recipes/people_daily.recipe
new file mode 100644
index 0000000000..4dec2452e2
--- /dev/null
+++ b/resources/recipes/people_daily.recipe
@@ -0,0 +1,56 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1277129332(BasicNewsRecipe):
+    title = u'People Daily - China'
+    oldest_article = 2
+    max_articles_per_feed = 100
+    __author__ = 'rty'
+
+    publisher = 'people.com.cn'
+    description = 'People Daily Newspaper'
+    language = 'zh'
+    category = 'News, China'
+    remove_javascript = True
+    use_embedded_content = False
+    no_stylesheets = True
+    encoding = 'GB2312'
+    conversion_options = {'linearize_tables':True}
+
+    feeds = [(u'\u56fd\u5185\u65b0\u95fb', u'http://www.people.com.cn/rss/politics.xml'),
+             (u'\u56fd\u9645\u65b0\u95fb', u'http://www.people.com.cn/rss/world.xml'),
+             (u'\u7ecf\u6d4e\u65b0\u95fb', u'http://www.people.com.cn/rss/finance.xml'),
+             (u'\u4f53\u80b2\u65b0\u95fb', u'http://www.people.com.cn/rss/sports.xml'),
+             (u'\u53f0\u6e7e\u65b0\u95fb', u'http://www.people.com.cn/rss/haixia.xml')]
+    keep_only_tags = [
+        dict(name='div', attrs={'class':'left_content'}),
+    ]
+    remove_tags = [
+        dict(name='table', attrs={'class':'title'}),
+    ]
+    remove_tags_after = [
+        dict(name='table', attrs={'class':'bianji'}),
+    ]
+
+    # Recursively fetch the remaining pages of a multi-page article and
+    # splice their 'left_content' body into the first page's soup.
+    def append_page(self, soup, appendtag, position):
+        pager = soup.find('img',attrs={'src':'/img/next_b.gif'})
+        if pager:
+            # NOTE(review): self.INDEX is never set on this class, and an <img>
+            # tag has no <a> child -- confirm against a real multi-page article.
+            nexturl = self.INDEX + pager.a['href']
+            soup2 = self.index_to_soup(nexturl)
+            texttag = soup2.find('div', attrs={'class':'left_content'})
+            newpos = len(texttag.contents)
+            self.append_page(soup2,texttag,newpos)
+            texttag.extract()
+            appendtag.insert(position,texttag)
+
+    def preprocess_html(self, soup):
+        mtag = '\n'
+        soup.head.insert(0,mtag)
+        # Strip inline style attributes so calibre's own stylesheet wins.
+        for item in soup.findAll(style=True):
+            del item['style']
+        self.append_page(soup, soup.body, 3)
+        return soup