From 5cc6465941057f1a3e2820f37f0750cd48774d26 Mon Sep 17 00:00:00 2001
From: Tom Scholl
Date: Mon, 4 Apr 2011 16:35:09 +0000
Subject: [PATCH 1/7] Added fast, no-image version of the Daily Mail recipe

---
 recipes/daily_mail_fast.recipe | 52 ++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 recipes/daily_mail_fast.recipe

diff --git a/recipes/daily_mail_fast.recipe b/recipes/daily_mail_fast.recipe
new file mode 100644
index 0000000000..9ddb6f77f8
--- /dev/null
+++ b/recipes/daily_mail_fast.recipe
@@ -0,0 +1,52 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TheDailyMail(BasicNewsRecipe):
+    title = u'The Daily Mail (fast)'
+    oldest_article = 2
+    language = 'en_GB'
+
+    __author__ = 'RufusA and Sujata Raman'
+    description = 'Faster and smaller version of the Daily Mail that does not download pictures'
+    simultaneous_downloads = 1
+    max_articles_per_feed = 50
+
+    extra_css = '''#js-article-text{font-family:Arial,Helvetica,sans-serif;}
+                   h1{font-size:x-large; font-weight:bold;}
+                   a.author{color:#003580;}
+                   .js-article-text{font-size:50%;}
+                   .imageCaption{font-size:x-small; font-weight:bold}
+                '''
+
+    remove_tags = [ dict(name='div', attrs={'class':['article-icon-links-container','print-or-mail-links cleared',
+                        'social-links cleared','explore-links','relatedItems','intellicrumbs box','travel','moduleHalf']}),
+                    dict(name='div', attrs={'id':['link-unit-wrapper','pushdown-ad','reader-comments','googleAds']}),
+                    dict(name='h3', attrs={'class':['social-links-title']}),
+                    dict(name='span', attrs={'class':['clickToEnlargeTop']}),
+                    dict(name=['img']),
+                  ]
+    #remove_tags_before = dict(name='div', attrs={'id':'content'})
+    keep_only_tags = [dict(name='div', attrs={'id':'js-article-text'})]
+
+    no_stylesheets = True
+
+    feeds = [
+        (u'Home', u'http://www.dailymail.co.uk/home/index.rss'),
+        (u'News', u'http://www.dailymail.co.uk/news/index.rss'),
+        (u'Sport', u'http://www.dailymail.co.uk/sport/index.rss'),
+        (u'TV and Showbiz', u'http://www.dailymail.co.uk/tvshowbiz/index.rss'),
+        (u'Femail', u'http://www.dailymail.co.uk/femail/index.rss'),
+        (u'Health', u'http://www.dailymail.co.uk/health/index.rss'),
+        (u'Science and Technology', u'http://www.dailymail.co.uk/sciencetech/index.rss'),
+        (u'Money', u'http://www.dailymail.co.uk/money/index.rss'),
+        (u'Property', u'http://www.dailymail.co.uk/property/index.rss'),
+        (u'Motoring', u'http://www.dailymail.co.uk/motoring/index.rss'),
+        (u'Travel', u'http://www.dailymail.co.uk/travel/index.rss')
+        ]
+
+    #def print_version(self, url):
+    #    main = url.partition('?')[0]
+    #    return main + '?printingPage=true'
+
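The speed win in the recipe above comes from the last remove_tags entry, dict(name=['img']), which strips every <img> element so calibre never downloads the pictures. A minimal sketch of the same pattern; the class name and feed URL below are hypothetical, for illustration only:

    from calibre.web.feeds.news import BasicNewsRecipe

    class FastTextOnly(BasicNewsRecipe):
        # Hypothetical example recipe, not part of this patch series
        title = u'Example (fast)'
        no_stylesheets = True
        # Dropping all <img> tags is what makes the download fast
        # and the resulting e-book small
        remove_tags = [dict(name=['img'])]
        feeds = [(u'Example', u'http://example.com/index.rss')]
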
From 43b57cb343edcf9f6d17f0b7013ec0c8e2b67d51 Mon Sep 17 00:00:00 2001
From: Tom Scholl
Date: Thu, 7 Apr 2011 22:16:12 +0000
Subject: [PATCH 2/7] More detailed sections for Guardian recipe

---
 recipes/guardian.recipe | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe
index 6211997b06..c5021cb91d 100644
--- a/recipes/guardian.recipe
+++ b/recipes/guardian.recipe
@@ -28,7 +28,7 @@ class Guardian(BasicNewsRecipe):
     # List of section titles to ignore
     # For example: ['Sport']
     ignore_sections = []
-    
+
     timefmt = ' [%a, %d %b %Y]'
     keep_only_tags = [
         dict(name='div', attrs={'id':["content","article_header","main-article-info",]}),
@@ -87,8 +87,14 @@ class Guardian(BasicNewsRecipe):
         idx = soup.find('div', id='book-index')
         for s in idx.findAll('strong', attrs={'class':'book'}):
             a = s.find('a', href=True)
-            yield (self.tag_to_string(a), a['href'])
-
+            section_title = self.tag_to_string(a)
+            if section_title not in self.ignore_sections:
+                prefix = ''
+                if section_title != 'Main section':
+                    prefix = section_title + ': '
+                for subsection in s.parent.findAll('a', attrs={'class':'book-section'}):
+                    yield (prefix + self.tag_to_string(subsection), subsection['href'])
+
     def find_articles(self, url):
         soup = self.index_to_soup(url)
         div = soup.find('div', attrs={'class':'book-index'})
@@ -109,15 +115,12 @@ class Guardian(BasicNewsRecipe):
             'title': title, 'url':url, 'description':desc,
             'date' : strftime('%a, %d %b'),
             }
-    
+
     def parse_index(self):
         try:
             feeds = []
             for title, href in self.find_sections():
-                if not title in self.ignore_sections:
-                    feeds.append((title, list(self.find_articles(href))))
+                feeds.append((title, list(self.find_articles(href))))
             return feeds
         except:
             raise NotImplementedError
-
-

From 87f281cf4dd5c8d4eb05b3e3440b756674a19827 Mon Sep 17 00:00:00 2001
From: Tom Scholl
Date: Tue, 10 May 2011 11:17:39 +0000
Subject: [PATCH 3/7] Updated Newsweek recipe

---
 recipes/newsweek.recipe | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/recipes/newsweek.recipe b/recipes/newsweek.recipe
index 73837c1872..740bf5299d 100644
--- a/recipes/newsweek.recipe
+++ b/recipes/newsweek.recipe
@@ -11,7 +11,20 @@ class Newsweek(BasicNewsRecipe):
     no_stylesheets = True

     BASE_URL = 'http://www.newsweek.com'
-    INDEX = BASE_URL+'/topics.html'
+
+    topics = {
+        'Culture' : '/tag/culture.html',
+        'Business' : '/tag/business.html',
+        'Society' : '/tag/society.html',
+        'Science' : '/tag/science.html',
+        'Education' : '/tag/education.html',
+        'Politics' : '/tag/politics.html',
+        'Health' : '/tag/health.html',
+        'World' : '/tag/world.html',
+        'Nation' : '/tag/nation.html',
+        'Technology' : '/tag/technology.html',
+        'Game Changers' : '/tag/game-changers.html',
+    }

     keep_only_tags = dict(name='article', attrs={'class':'article-text'})
     remove_tags = [dict(attrs={'data-dartad':True})]
@@ -23,10 +36,9 @@ class Newsweek(BasicNewsRecipe):
         return soup

     def newsweek_sections(self):
-        soup = self.index_to_soup(self.INDEX)
-        for a in soup.findAll('a', title='Primary tag', href=True):
-            yield (string.capitalize(self.tag_to_string(a)),
-                   self.BASE_URL+a['href'])
+        for topic_name, topic_url in self.topics.iteritems():
+            yield (topic_name,
+                   self.BASE_URL+topic_url)


     def newsweek_parse_section_page(self, soup):

From 8b3c13cedaa7f3ca6bed2939179a7240ffaf4405 Mon Sep 17 00:00:00 2001
From: Tom Scholl
Date: Wed, 18 May 2011 17:22:25 +0000
Subject: [PATCH 4/7] Added Finance section to Telegraph recipe

---
 recipes/telegraph_uk.recipe | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recipes/telegraph_uk.recipe b/recipes/telegraph_uk.recipe
index f79f0fa50c..5fe5b168b8 100644
--- a/recipes/telegraph_uk.recipe
+++ b/recipes/telegraph_uk.recipe
@@ -49,6 +49,7 @@ class TelegraphUK(BasicNewsRecipe):
          (u'UK News'         , u'http://www.telegraph.co.uk/news/uknews/rss'                                    )
         ,(u'World News'      , u'http://www.telegraph.co.uk/news/worldnews/rss'                                 )
         ,(u'Politics'        , u'http://www.telegraph.co.uk/news/newstopics/politics/rss'                       )
+        ,(u'Finance'         , u'http://www.telegraph.co.uk/finance/rss'                                        )
         ,(u'Technology News' , u'http://www.telegraph.co.uk/scienceandtechnology/technology/technologynews/rss' )
         ,(u'UK News'         , u'http://www.telegraph.co.uk/scienceandtechnology/technology/technologyreviews/rss')
         ,(u'Science News'    , u'http://www.telegraph.co.uk/scienceandtechnology/science/sciencenews/rss'       )
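The Newsweek change above swaps a scrape of /topics.html for a hard-coded topic map, so section discovery no longer depends on the markup of an index page. The shape of the resulting generator is easy to exercise in isolation; a sketch with a trimmed-down stand-in for the full topics dict (sorted() is added here only to make the iteration order deterministic, a small deviation from the patch's iteritems()):

    BASE_URL = 'http://www.newsweek.com'
    # Trimmed-down stand-in for the topics dict in the patch
    topics = {
        'World': '/tag/world.html',
        'Science': '/tag/science.html',
    }

    def newsweek_sections():
        # Static mapping: no network round trip and no fragile scrape,
        # at the cost of a recipe update whenever sections change.
        for name, path in sorted(topics.items()):
            yield (name, BASE_URL + path)

    for title, url in newsweek_sections():
        print('%s -> %s' % (title, url))
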
From afde9acfdf1e88c92867e25bc608edfd0637c3bd Mon Sep 17 00:00:00 2001
From: Tom Scholl
Date: Fri, 20 May 2011 06:49:24 +0000
Subject: [PATCH 5/7] Re-enabled Time magazine

---
 recipes/time_magazine.recipe | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipes/time_magazine.recipe b/recipes/time_magazine.recipe
index 380bf71f8c..ac7821b65a 100644
--- a/recipes/time_magazine.recipe
+++ b/recipes/time_magazine.recipe
@@ -10,8 +10,8 @@ import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class Time(BasicNewsRecipe):
-    recipe_disabled = ('This recipe has been disabled as TIME no longer'
-                       ' publish complete articles on the web.')
+    #recipe_disabled = ('This recipe has been disabled as TIME no longer'
+    #                   ' publish complete articles on the web.')
     title = u'Time'
     __author__ = 'Kovid Goyal and Sujata Raman'
     description = 'Weekly magazine'

From 0d4c451e6befbf59860efedd224fc85c83420702 Mon Sep 17 00:00:00 2001
From: Tom Scholl
Date: Tue, 24 May 2011 14:23:42 +0000
Subject: [PATCH 6/7] Resize large images to reduce size of Daily Mail news recipe

---
 recipes/daily_mail.recipe | 21 +++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/recipes/daily_mail.recipe b/recipes/daily_mail.recipe
index ac2dfd1777..40d43864c7 100644
--- a/recipes/daily_mail.recipe
+++ b/recipes/daily_mail.recipe
@@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.magick import Image

 class TheDailyMail(BasicNewsRecipe):
     title = u'The Daily Mail'
@@ -46,5 +47,21 @@ class TheDailyMail(BasicNewsRecipe):
     #def print_version(self, url):
     #    main = url.partition('?')[0]
     #    return main + '?printingPage=true'
-
-
+
+
+    def postprocess_html(self, soup, first):
+        # Process every image in the article; assumes the rewritten HTML
+        # already points at the locally downloaded image files.
+        for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
+            iurl = tag['src']
+            img = Image()
+            img.open(iurl)
+            width, height = img.size
+            self.log('img:', iurl, 'width:', width, 'height:', height)
+            # Halve oversized images: scaling both dimensions by 50%
+            # quarters the pixel data and keeps the e-book small.
+            if width > 520 or height > 640:
+                self.log('Resizing image to 50%')
+                img.size = (width / 2, height / 2)
+                img.save(iurl)
+        return soup
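The resize rule in the Daily Mail patch is worth stating precisely: any image wider than 520px or taller than 640px is scaled to half size in both dimensions, cutting its pixel count to a quarter. A self-contained sketch of just that decision logic; the thresholds come from the patch, while the function name and integer division (//, safe under both Python 2 and 3) are illustrative choices:

    def halved_size(width, height, max_w=520, max_h=640):
        # Halve both dimensions when either threshold is exceeded;
        # 520x640 are the limits used in the patch above.
        if width > max_w or height > max_h:
            return (width // 2, height // 2)
        return (width, height)

    assert halved_size(1000, 400) == (500, 200)   # too wide -> halved
    assert halved_size(400, 800) == (200, 400)    # too tall -> halved
    assert halved_size(500, 600) == (500, 600)    # within limits -> unchanged
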
From 63fb99692956f1e6f0a98fe6157856dc38d260fc Mon Sep 17 00:00:00 2001
From: Tom Scholl
Date: Thu, 26 May 2011 22:27:15 +0000
Subject: [PATCH 7/7] Fixed erroneous log_debug calls in Zaobao news recipe

---
 recipes/zaobao.recipe | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/recipes/zaobao.recipe b/recipes/zaobao.recipe
index 91a5459e18..a6d7d2e08e 100644
--- a/recipes/zaobao.recipe
+++ b/recipes/zaobao.recipe
@@ -82,7 +82,7 @@ class ZAOBAO(BasicNewsRecipe):
         return soup

     def parse_feeds(self):
-        self.log_debug(_('ZAOBAO overrided parse_feeds()'))
+        self.log(_('ZAOBAO overrode parse_feeds()'))
         parsed_feeds = BasicNewsRecipe.parse_feeds(self)

         for id, obj in enumerate(self.INDEXES):
@@ -99,7 +99,7 @@ class ZAOBAO(BasicNewsRecipe):
                 a_title = self.tag_to_string(a)
                 date = ''
                 description = ''
-                self.log_debug(_('adding %s at %s')%(a_title,a_url))
+                self.log(_('adding %s at %s')%(a_title,a_url))
                 articles.append({
                     'title':a_title,
                     'date':date,
@@ -110,23 +110,23 @@ class ZAOBAO(BasicNewsRecipe):
             pfeeds = feeds_from_index([(title, articles)], oldest_article=self.oldest_article,
                 max_articles_per_feed=self.max_articles_per_feed)

-            self.log_debug(_('adding %s to feed')%(title))
+            self.log(_('adding %s to feed')%(title))
             for feed in pfeeds:
-                self.log_debug(_('adding feed: %s')%(feed.title))
+                self.log(_('adding feed: %s')%(feed.title))
                 feed.description = self.DESC_SENSE
                 parsed_feeds.append(feed)
                 for a, article in enumerate(feed):
-                    self.log_debug(_('added article %s from %s')%(article.title, article.url))
-                self.log_debug(_('added feed %s')%(feed.title))
+                    self.log(_('added article %s from %s')%(article.title, article.url))
+                self.log(_('added feed %s')%(feed.title))

         for i, feed in enumerate(parsed_feeds):
            # workaround for a strange problem: sometimes the XML encoding is not applied correctly by parse()
            weired_encoding_detected = False
            if not isinstance(feed.description, unicode) and self.encoding and feed.description:
-               self.log_debug(_('Feed %s is not encoded correctly, manually replace it')%(feed.title))
+               self.log(_('Feed %s is not encoded correctly, manually replace it')%(feed.title))
               feed.description = feed.description.decode(self.encoding, 'replace')
            elif feed.description.find(self.DESC_SENSE) == -1 and self.encoding and feed.description:
-               self.log_debug(_('Feed %s is weired encoded, manually redo all')%(feed.title))
+               self.log(_('Feed %s is encoded strangely, manually redo all')%(feed.title))
               feed.description = feed.description.encode('cp1252', 'replace').decode(self.encoding, 'replace')
               weired_encoding_detected = True

@@ -148,7 +148,7 @@ class ZAOBAO(BasicNewsRecipe):
                     article.text_summary = article.text_summary.encode('cp1252', 'replace').decode(self.encoding, 'replace')

             if article.title == "Untitled article":
-                self.log_debug(_('Removing empty article %s from %s')%(article.title, article.url))
+                self.log(_('Removing empty article %s from %s')%(article.title, article.url))
                 # remove the article
                 feed.articles[a:a+1] = []
         return parsed_feeds
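The encode('cp1252', 'replace').decode(self.encoding, 'replace') dance in the Zaobao recipe repairs text that was first decoded with the wrong codec: re-encoding with that same wrong codec recovers the original bytes, which can then be decoded correctly. A standalone sketch of the round trip, with 'gbk' assumed as the real encoding purely for illustration (the recipe uses whatever self.encoding is configured to):

    # Bytes in the site's real encoding ('gbk' assumed for illustration)
    raw = u'\u4e2d\u6587'.encode('gbk')
    # A wrong first decode (cp1252) produces mojibake...
    garbled = raw.decode('cp1252', 'replace')
    # ...re-encoding with the same wrong codec restores the raw bytes,
    # and a second decode with the right codec repairs the text. Any
    # byte that does not round-trip through cp1252 is lost to 'replace'.
    repaired = garbled.encode('cp1252', 'replace').decode('gbk', 'replace')
    assert repaired == u'\u4e2d\u6587'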