', lambda match : '
'), (r'Learn more about our Privacy Policy.*?', lambda match : ''), ] - ] - + ] + + - feeds = [ ('AP Headlines', 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=ORAST&SECTION=HOME'), ('AP US News', 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=CAVIC&SECTION=HOME'), ('AP World News', 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml?SITE=SCAND&SECTION=HOME'), @@ -38,4 +38,4 @@ class AssociatedPress(BasicNewsRecipe): ('AP Health News', 'http://hosted.ap.org/lineups/HEALTHHEADS-rss_2.0.xml?SITE=FLDAY&SECTION=HOME'), ('AP Science News', 'http://hosted.ap.org/lineups/SCIENCEHEADS-rss_2.0.xml?SITE=OHCIN&SECTION=HOME'), ('AP Strange News', 'http://hosted.ap.org/lineups/STRANGEHEADS-rss_2.0.xml?SITE=WCNC&SECTION=HOME'), - ] \ No newline at end of file + ] diff --git a/resources/recipes/bbc_chinese.recipe b/resources/recipes/bbc_chinese.recipe new file mode 100644 index 0000000000..e2bff81b90 --- /dev/null +++ b/resources/recipes/bbc_chinese.recipe @@ -0,0 +1,39 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1277443634(BasicNewsRecipe): + title = u'BBC Chinese' + oldest_article = 7 + max_articles_per_feed = 100 + + feeds = [ + (u'\u4e3b\u9875', u'http://www.bbc.co.uk/zhongwen/simp/index.xml'), + (u'\u56fd\u9645\u65b0\u95fb', u'http://www.bbc.co.uk/zhongwen/simp/world/index.xml'), + (u'\u4e24\u5cb8\u4e09\u5730', u'http://www.bbc.co.uk/zhongwen/simp/china/index.xml'), + (u'\u91d1\u878d\u8d22\u7ecf', u'http://www.bbc.co.uk/zhongwen/simp/business/index.xml'), + (u'\u7f51\u4e0a\u4e92\u52a8', u'http://www.bbc.co.uk/zhongwen/simp/interactive/index.xml'), + (u'\u97f3\u89c6\u56fe\u7247', u'http://www.bbc.co.uk/zhongwen/simp/multimedia/index.xml'), + (u'\u5206\u6790\u8bc4\u8bba', u'http://www.bbc.co.uk/zhongwen/simp/indepth/index.xml') + ] + extra_css = ''' + @font-face {font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\n + body {margin-right: 8pt; font-family: 'DroidFont', serif;}\n + h1 {font-family: 'DroidFont', serif;}\n + .articledescription {font-family: 'DroidFont', serif;} + ''' + __author__ = 'rty' + __version__ = '1.0' + language = 'zh' + pubisher = 'British Broadcasting Corporation' + description = 'BBC news in Chinese' + category = 'News, Chinese' + remove_javascript = True + use_embedded_content = False + no_stylesheets = True + encoding = 'UTF-8' + conversion_options = {'linearize_tables':True} + masthead_url = 'http://wscdn.bbc.co.uk/zhongwen/simp/images/1024/brand.jpg' + keep_only_tags = [ + dict(name='h1'), + dict(name='p', attrs={'class':['primary-topic','summary']}), + dict(name='div', attrs={'class':['bodytext','datestamp']}), + ] diff --git a/resources/recipes/big_oven.recipe b/resources/recipes/big_oven.recipe new file mode 100644 index 0000000000..e1636daf72 --- /dev/null +++ b/resources/recipes/big_oven.recipe @@ -0,0 +1,64 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class BigOven(BasicNewsRecipe): + title = 'BigOven' + __author__ = 'Starson17' + description = 'Recipes for the Foodie in us all. Registration is free. A fake username and password just gives smaller photos.' + language = 'en' + category = 'news, food, recipes, gourmet' + publisher = 'Starson17' + use_embedded_content= False + no_stylesheets = True + oldest_article = 24 + remove_javascript = True + remove_empty_feeds = True + cover_url = 'http://www.software.com/images/products/BigOven%20Logo_177_216.JPG' + max_articles_per_feed = 30 + needs_subscription = True + + conversion_options = {'linearize_tables' : True + , 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('http://www.bigoven.com/') + br.select_form(name='form1') + br['TopMenu_bo1$email'] = self.username + br['TopMenu_bo1$password'] = self.password + br.submit() + return br + + remove_attributes = ['style', 'font'] + + keep_only_tags = [dict(name='h1') + ,dict(name='div', attrs={'class':'img'}) + ,dict(name='div', attrs={'id':'intro'}) + ] + + remove_tags = [dict(name='div', attrs={'style':["overflow: visible;"]}) + ,dict(name='div', attrs={'class':['ctas']}) + #,dict(name='a', attrs={'class':['edit']}) + ,dict(name='p', attrs={'class':['byline']}) + ] + + feeds = [(u'4 & 5 Star Rated Recipes', u'http://feeds.feedburner.com/Bigovencom-RecipeRaves?format=xml')] + + def preprocess_html(self, soup): + for tag in soup.findAll(name='a', attrs={'class':['edit']}): + tag.parent.extract() + for tag in soup.findAll(name='a', attrs={'class':['deflink']}): + tag.replaceWith(tag.string) + return soup + + extra_css = ''' + h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} + h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:medium;} + p{font-family:Arial,Helvetica,sans-serif;font-size:small;} + body{font-family:Helvetica,Arial,sans-serif;font-size:small;} + ''' + diff --git a/resources/recipes/china_economic_net.recipe b/resources/recipes/china_economic_net.recipe new file mode 100644 index 0000000000..825ea007c2 --- /dev/null +++ b/resources/recipes/china_economic_net.recipe @@ -0,0 +1,39 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1278162597(BasicNewsRecipe): + __author__ = 'rty' + title = u'China Economic Net' + oldest_article = 7 + max_articles_per_feed = 100 + + pubisher = 'www.ce.cn - China Economic net - Beijing' + description = 'China Economic Net Magazine' + category = 'Economic News Magazine, Chinese, China' + feeds = [ + (u'Stock Market 股市', u'http://finance.ce.cn/stock/index_6304.xml'), + (u'Money 理财', u'http://finance.ce.cn/money/index_6301.xml'), + (u'Health 健康', u'http://www.ce.cn/health/index_6294.xml'), + (u'Technology 科技', u'http://sci.ce.cn/mainpage/index_6307.xml'), + (u'Domestic Politics 国内时政', u'http://www.ce.cn/xwzx/gnsz/index_6273.xml') + ] + masthead_url = 'http://finance.ce.cn/images/08mdy_logo.gif' + extra_css = ''' + @font-face {font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\n + body {margin-right: 8pt; font-family: 'DroidFont', serif;}\n + h1 {font-family: 'DroidFont', serif;}\n + .articledescription {font-family: 'DroidFont', serif;} + ''' + remove_javascript = True + use_embedded_content = False + no_stylesheets = True + language = 'zh-cn' + encoding = 'gb2312' + conversion_options = {'linearize_tables':True} + + + keep_only_tags = [ + + dict(name='h1', attrs={'id':'articleTitle'}), + dict(name='div', attrs={'class':'laiyuan'}), + dict(name='div', attrs={'id':'articleText'}), + ] diff --git a/resources/recipes/china_press.recipe b/resources/recipes/china_press.recipe new file mode 100644 index 0000000000..502ebfd41c --- /dev/null +++ b/resources/recipes/china_press.recipe @@ -0,0 +1,71 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1277228948(BasicNewsRecipe): + title = u'China Press USA' + oldest_article = 7 + max_articles_per_feed = 100 + + __author__ = 'rty' + __version__ = '1.0' + language = 'zh' + pubisher = 'www.chinapressusa.com' + description = 'Overseas Chinese Network Newspaper in the USA' + category = 'News in Chinese, USA' + remove_javascript = True + use_embedded_content = False + no_stylesheets = True + #encoding = 'GB2312' + encoding = 'UTF-8' + conversion_options = {'linearize_tables':True} + masthead_url ='http://www.chinapressusa.com/common/images/logo.gif' + extra_css = ''' + @font-face { font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\n + body { + margin-right: 8pt; + font-family: 'DroidFont', serif;} + h1 {font-family: 'DroidFont', serif, sans-serif} + .show {font-family: 'DroidFont', serif, sans-serif} + ''' + feeds = [ + (u'\u65b0\u95fb\u9891\u9053', u'http://news.uschinapress.com/news.xml'), + (u'\u534e\u4eba\u9891\u9053', u'http://chinese.uschinapress.com/chinese.xml'), + (u'\u8bc4\u8bba\u9891\u9053', u'http://review.uschinapress.com/review.xml'), + ] + keep_only_tags = [ + dict(name='div', attrs={'class':'show'}), + ] + remove_tags = [ + # dict(name='table', attrs={'class':'xle'}), + dict(name='div', attrs={'class':'time'}), + ] + remove_tags_after = [ + dict(name='div', attrs={'class':'bank17'}), + # dict(name='a', attrs={'class':'ab12'}), + ] + + + def append_page(self, soup, appendtag, position): + pager = soup.find('div',attrs={'id':'displaypagenum'}) + if pager: + nexturl = self.INDEX + pager.a['href'] + soup2 = self.index_to_soup(nexturl) + texttag = soup2.find('div', attrs={'class':'show'}) + for it in texttag.findAll(style=True): + del it['style'] + newpos = len(texttag.contents) + self.append_page(soup2,texttag,newpos) + texttag.extract() + appendtag.insert(position,texttag) + + + def preprocess_html(self, soup): + mtag = '\n' + soup.head.insert(0,mtag) + + for item in soup.findAll(style=True): + del item['style'] + self.append_page(soup, soup.body, 3) + pager = soup.find('div',attrs={'id':'displaypagenum'}) + if pager: + pager.extract() + return soup diff --git a/resources/recipes/editor_and_publisher.recipe b/resources/recipes/editor_and_publisher.recipe index c8f287a0c7..0ec5c59d74 100644 --- a/resources/recipes/editor_and_publisher.recipe +++ b/resources/recipes/editor_and_publisher.recipe @@ -1,14 +1,29 @@ -import re +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2010 elsuave' + from calibre.web.feeds.news import BasicNewsRecipe class EandP(BasicNewsRecipe): title = u'Editor and Publisher' - __author__ = u'Xanthan Gum' + __author__ = u'elsuave (modified from Xanthan Gum)' description = 'News about newspapers and journalism.' + publisher = 'Editor and Publisher' + category = 'news, journalism, industry' language = 'en' - no_stylesheets = True + max_articles_per_feed = 25 + no_stylesheets = True + use_embedded_content = False + encoding = 'utf8' + cover_url = 'http://www.editorandpublisher.com/images/EP_main_logo.gif' + remove_javascript = True - oldest_article = 7 - max_articles_per_feed = 100 + html2lrf_options = [ + '--comment', description + , '--category', category + , '--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' # Font formatting code borrowed from kwetal @@ -18,17 +33,21 @@ class EandP(BasicNewsRecipe): h2{font-size: large;} ''' - # Delete everything before the article + # Keep only div:itemmgap - remove_tags_before = dict(name='font', attrs={'class':'titlebar_black'}) + keep_only_tags = [ + dict(name='div', attrs={'class':'itemmgap'}) + ] - # Delete everything after the article + # Remove commenting/social media lins - preprocess_regexps = [(re.compile(r'.*