# Patch provenance (mail header of the patch this code was extracted from):
#   commit:  887b1921487b7f136d4122af8d297719f6d7922f
#   From:    Kovid Goyal
#   Date:    Sat, 19 Jun 2010 17:14:01 -0600
#   Subject: [PATCH] New recipes: Auto Prove by GB, Forbes India, Maximum PC,
#            Today Online by rty
#   New files:
#     resources/recipes/auto_prove.recipe
#     resources/recipes/forbes_india.recipe
#     resources/recipes/maximum_pc.recipe
#     resources/recipes/today_online.recipe

# ---------------------------------------------------------------------------
# resources/recipes/auto_prove.recipe
# ---------------------------------------------------------------------------
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'GabrieleMarini, based on Darko Miletic'
__copyright__ = '2009, Darko Miletic , Gabriele Marini'
__version__ = 'v1.02 Marini Gabriele '
__date__ = '10, January 2010'
__description__ = 'Italian daily newspaper'

'''
http://www.auto.it/
'''
from calibre.web.feeds.news import BasicNewsRecipe


class AutoPR(BasicNewsRecipe):
    """Recipe for the road tests ("Prove su Strada") RSS feed of auto.it.

    Every RSS item becomes its own feed, made of the seven sub-pages
    assembled by create_links_append().
    """
    __author__ = 'Gabriele Marini'
    description = 'Auto and Formula 1'

    cover_url = 'http://www.auto.it/res/imgs/logo_Auto.png'

    title = u'Auto Prove'
    publisher = 'CONTE Editore'
    category = 'Sport'

    language = 'it'
    timefmt = '[%a, %d %b, %Y]'

    oldest_article = 60
    max_articles_per_feed = 20
    use_embedded_content = False
    # NOTE(review): calibre documents this setting as `recursions`; confirm
    # whether an attribute named `recursion` is read at all.
    recursion = 100

    remove_javascript = True
    no_stylesheets = True

    # Legacy conversion options, left disabled as in the original recipe:
    # html2lrf_options = [
    #     '--comment', description
    #     , '--category', category
    #     , '--publisher', publisher
    #     , '--ignore-tables'
    #     ]
    #
    # html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'

    keep_only_tags = [
        dict(name='h2', attrs={'class': ['tit_Article y_Txt']}),
        dict(name='h2', attrs={'class': ['tit_Article']}),
        dict(name='div', attrs={'class': ['box_Img newsdet_new ']}),
        dict(name='div', attrs={'class': ['box_Img newsdet_as ']}),
        dict(name='table', attrs={'class': ['table_A']}),
        dict(name='div', attrs={'class': ['txt_Article txtBox_cms']}),
        dict(name='testoscheda'),
    ]

    def parse_index(self):
        """Build the feed list from the road-test RSS feed.

        Returns a list of ``(feed_title, articles)`` tuples, one per RSS
        item: ``feed_title`` is the item's title and ``articles`` is the
        list returned by create_links_append() for the item's guid.
        """
        feeds = []
        sources = [
            ("Prove su Strada", "http://www.auto.it/rss/prove+6.xml"),
        ]
        # The section label is not used for the output: each feed is named
        # after the individual RSS item instead.
        for _section, url in sources:
            channel = self.index_to_soup(url).find('channel')
            if channel is None:
                # No <channel> element in the downloaded document; skip this
                # source rather than fail on None.findAllNext().
                continue

            for article in channel.findAllNext('item'):
                title = self.tag_to_string(article.title)
                date = self.tag_to_string(article.pubDate)
                description = self.tag_to_string(article.description)
                link = self.tag_to_string(article.guid)
                articles = self.create_links_append(link, date, description)
                if articles:
                    feeds.append((title, articles))
        return feeds

    def create_links_append(self, link, date, description):
        """Return the article dicts for one road test.

        The first entry points at *link* itself and carries *date* and
        *description*; the remaining entries point at *link* with the
        substring 'scheda' replaced by the name of each sub-page.
        """
        # (title, replacement for 'scheda' in the url, description)
        # NOTE(review): the 'Design' entry's description is 'scheda' while
        # every other entry repeats its title - kept as in the original;
        # confirm whether 'Design' was intended.
        sub_pages = [
            ('Design', 'design', 'scheda'),
            ('Interni', 'interni', 'Interni'),
            ('Tecnica', 'tecnica', 'Tecnica'),
            ('Su Strada', 'su_strada', 'Su Strada'),
            ('Pagella', 'pagella', 'Pagella'),
            ('Rilevamenti', 'telemetria', 'Rilevamenti'),
        ]

        current_articles = [
            {'title': 'Generale', 'url': link,
             'description': description, 'date': date},
        ]
        for page_title, page_name, page_description in sub_pages:
            current_articles.append({
                'title': page_title,
                'url': link.replace('scheda', page_name),
                'description': page_description,
                'date': '',
            })
        return current_articles


# ---------------------------------------------------------------------------
# resources/recipes/forbes_india.recipe
# ---------------------------------------------------------------------------
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1276934715(BasicNewsRecipe):
    """Recipe for Forbes India, built from the business.in.com RSS feed."""
    title = u'Forbes India'
    __author__ = 'rty'
    description = 'India Edition Forbes'
    publisher = 'Forbes India'
    category = 'Business News, Economy, India'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'en_IN'
    # Handles of the temporary files created by get_obfuscated_article().
    temp_files = []
    articles_are_obfuscated = True
    conversion_options = {'linearize_tables': True}
    feeds = [
        (u'Contents', u'http://business.in.com/rssfeed/rss_all.xml'),
    ]
    extra_css = '''
        .t-10-gy-l{font-style: italic; font-size: small}
        .t-30-b-d{font-weight: bold; font-size: xx-large}
        .t-16-gy-l{font-weight: bold; font-size: x-large; font-style: italic}
        .storycontent{font-size: 4px;font-family: Times New Roman;}
        '''

    remove_tags_before = dict(name='div', attrs={'class': 'pdl10 pdr15'})

    def get_obfuscated_article(self, url):
        """Save the page linked as /printcontent/<digits> and return its path.

        Opens *url*, follows the first link on it whose URL matches
        ``/printcontent/[0-9]+``, writes the response body to a persistent
        temporary file and returns that file's name.
        """
        br = self.get_browser()
        br.open(url)
        response = br.follow_link(url_regex=r'/printcontent/[0-9]+', nr=0)
        html = response.read()
        tmp = PersistentTemporaryFile('_fa.html')
        self.temp_files.append(tmp)
        tmp.write(html)
        tmp.close()
        return tmp.name

    def get_cover_url(self):
        """Return the href of the first 'lbOn a-9-b-d' link on the magazine page.

        Returns None when the page contains no such link.
        """
        index = 'http://business.in.com/magazine/'
        soup = self.index_to_soup(index)
        image = soup.find('a', {"class": "lbOn a-9-b-d"})
        if image is None:
            return None
        return image['href']

    def preprocess_html(self, soup):
        """Drop every inline ``style`` and ``width`` attribute from *soup*."""
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(width=True):
            del item['width']
        return soup
# ---------------------------------------------------------------------------
# resources/recipes/maximum_pc.recipe
# ---------------------------------------------------------------------------
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1276930924(BasicNewsRecipe):
    """Recipe for Maximum PC, built from the maximumpc.com article feeds."""
    title = u'Maximum PC'
    __author__ = 'rty'
    description = 'Maximum PC'
    publisher = 'http://www.maximumpc.com'
    category = 'news, computer, technology'
    language = 'en'
    oldest_article = 30
    max_articles_per_feed = 100
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    # Handles of the temporary files created by get_obfuscated_article().
    temp_files = []
    articles_are_obfuscated = True
    feeds = [
        (u'News', u'http://www.maximumpc.com/articles/4/feed'),
        (u'Reviews', u'http://www.maximumpc.com/articles/40/feed'),
        (u'Editors Blog', u'http://www.maximumpc.com/articles/6/feed'),
        (u'How-to', u'http://www.maximumpc.com/articles/32/feed'),
        (u'Features', u'http://www.maximumpc.com/articles/31/feed'),
        (u'From the Magazine', u'http://www.maximumpc.com/articles/72/feed'),
    ]
    keep_only_tags = [
        dict(name='div', attrs={'class': ['print-title', 'article_body']}),
    ]
    remove_tags = [
        dict(name='div', attrs={'class': 'comments-tags-actions'}),
    ]
    remove_tags_before = dict(name='div', attrs={'class': 'print-title'})
    remove_tags_after = dict(name='div', attrs={'class': 'meta-content'})

    def get_obfuscated_article(self, url):
        """Save the page linked as /print/<digits> and return its path.

        Opens *url*, follows the first link on it whose URL matches
        ``/print/[0-9]+``, writes the response body to a persistent
        temporary file and returns that file's name.
        """
        br = self.get_browser()
        br.open(url)
        response = br.follow_link(url_regex=r'/print/[0-9]+', nr=0)
        html = response.read()
        tmp = PersistentTemporaryFile('_fa.html')
        self.temp_files.append(tmp)
        tmp.write(html)
        tmp.close()
        return tmp.name


# Next file in the patch: resources/recipes/today_online.recipe
# ---------------------------------------------------------------------------
# resources/recipes/today_online.recipe
# ---------------------------------------------------------------------------
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1276486274(BasicNewsRecipe):
    """Recipe for Today Online (Singapore), built from todayonline.com RSS."""
    title = u'Today Online - Singapore'
    publisher = 'MediaCorp Press Ltd - Singapore'
    __author__ = 'rty'
    category = 'news, Singapore'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'en_SG'
    # Handles of the temporary files created by get_obfuscated_article().
    temp_files = []
    articles_are_obfuscated = True
    masthead_url = 'http://www.todayonline.com/App_Themes/Default/images/icons/TodayOnlineLogo.gif'
    conversion_options = {'linearize_tables': True}
    extra_css = '''
        .author{font-style: italic; font-size: small}
        .date{font-style: italic; font-size: small}
        .Headline{font-weight: bold; font-size: xx-large}
        .headerStrap{font-weight: bold; font-size: x-large; font-style: italic}
        .bodyText{font-size: 4px;font-family: Times New Roman;}
        '''
    keep_only_tags = [
        dict(name='div', attrs={'id': ['fullPrintBodyHolder']}),
    ]
    remove_tags_after = [dict(name='div', attrs={'class': 'button'})]

    remove_tags = [
        dict(name='div', attrs={'class': ['url', 'button']}),
    ]
    feeds = [
        (u'Singapore', u'http://www.todayonline.com/RSS/Singapore'),
        (u'Hot News', u'http://www.todayonline.com/RSS/Hotnews'),
        (u'Today Online', u'http://www.todayonline.com/RSS/Todayonline'),
        (u'Voices', u'http://www.todayonline.com/RSS/Voices'),
        (u'Commentary', u'http://www.todayonline.com/RSS/Commentary'),
        (u'World', u'http://www.todayonline.com/RSS/World'),
        (u'Business', u'http://www.todayonline.com/RSS/Business'),
        (u'Column', u'http://www.todayonline.com/RSS/Columns'),
    ]

    def get_obfuscated_article(self, url):
        """Save the page linked as /Print/ and return its path.

        Opens *url*, follows the first link on it whose URL matches
        ``/Print/``, writes the response body to a persistent temporary
        file and returns that file's name.
        """
        br = self.get_browser()
        br.open(url)
        response = br.follow_link(url_regex=r'/Print/', nr=0)
        html = response.read()
        tmp = PersistentTemporaryFile('_fa.html')
        self.temp_files.append(tmp)
        tmp.write(html)
        tmp.close()
        return tmp.name

    def preprocess_html(self, soup):
        """Drop every inline ``style`` attribute from *soup*."""
        for item in soup.findAll(style=True):
            del item['style']
        return soup