From d221e7e448f245867479671f316520597cf86115 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Feb 2009 11:42:28 -0800 Subject: [PATCH] New recipe for The Chicago Tribune by Kovid Goyal --- src/calibre/web/feeds/__init__.py | 1 - src/calibre/web/feeds/recipes/__init__.py | 2 +- .../feeds/recipes/recipe_chicago_tribune.py | 82 +++++++++++++++++++ 3 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 src/calibre/web/feeds/recipes/recipe_chicago_tribune.py diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py index 82e3f40c10..3f0ec414a2 100644 --- a/src/calibre/web/feeds/__init__.py +++ b/src/calibre/web/feeds/__init__.py @@ -23,7 +23,6 @@ class Article(object): try: self.title = re.sub(r'&(\S+);', entity_to_unicode, self.title) - print 11111, repr(self.title) except: pass self.url = url diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py index 19c4f2827a..92fbbda555 100644 --- a/src/calibre/web/feeds/recipes/__init__.py +++ b/src/calibre/web/feeds/recipes/__init__.py @@ -30,7 +30,7 @@ recipe_modules = ['recipe_' + r for r in ( 'honoluluadvertiser', 'starbulletin', 'exiled', 'indy_star', 'dna', 'pobjeda', 'chicago_breaking_news', 'glasgow_herald', 'linuxdevices', 'hindu', 'cincinnati_enquirer', 'physics_world', 'pressonline', - 'la_republica', 'physics_today', + 'la_republica', 'physics_today', 'chicago_tribune', )] import re, imp, inspect, time, os diff --git a/src/calibre/web/feeds/recipes/recipe_chicago_tribune.py b/src/calibre/web/feeds/recipes/recipe_chicago_tribune.py new file mode 100644 index 0000000000..a4754dd751 --- /dev/null +++ b/src/calibre/web/feeds/recipes/recipe_chicago_tribune.py @@ -0,0 +1,82 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import re +from urlparse import urlparse, urlunparse +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ptempfile import PersistentTemporaryFile +from threading import RLock + +class ChicagoTribune(BasicNewsRecipe): + + title = 'Chicago Tribune' + __author__ = 'Kovid Goyal' + description = 'Politics, local and business news from Chicago' + language = _('English') + use_embedded_content = False + articles_are_obfuscated = True + remove_tags_before = dict(name='h1') + obfuctation_lock = RLock() + + feeds = [ + ('Latest news', 'http://feeds.chicagotribune.com/chicagotribune/news/'), + ('Local news', 'http://feeds.chicagotribune.com/chicagotribune/news/local/'), + ('Nation/world', 'http://feeds.chicagotribune.com/chicagotribune/news/nationworld/'), + ('Hot topics', 'http://feeds.chicagotribune.com/chicagotribune/hottopics/'), + ('Most E-mailed stories', 'http://feeds.chicagotribune.com/chicagotribune/email/'), + ('Opinion', 'http://feeds.chicagotribune.com/chicagotribune/opinion/'), + ('Off Topic', 'http://feeds.chicagotribune.com/chicagotribune/offtopic/'), + ('Politics', 'http://feeds.chicagotribune.com/chicagotribune/politics/'), + ('Special Reports', 'http://feeds.chicagotribune.com/chicagotribune/special/'), + ('Religion News', 'http://feeds.chicagotribune.com/chicagotribune/religion/'), + ('Business news', 'http://feeds.chicagotribune.com/chicagotribune/business/'), + ('Jobs and Careers', 'http://feeds.chicagotribune.com/chicagotribune/career/'), + ('Local scene', 'http://feeds.chicagotribune.com/chicagohomes/localscene/'), + ('Phil Rosenthal', 'http://feeds.chicagotribune.com/chicagotribune/rosenthal/'), + ('Tech Buzz', 'http://feeds.chicagotribune.com/chicagotribune/techbuzz/'), + ('Your Money', 'http://feeds.chicagotribune.com/chicagotribune/yourmoney/'), + ('Jon Hilkevitch - Getting around', 'http://feeds.chicagotribune.com/chicagotribune/gettingaround/'), + ('Jon Yates - What\'s your problem?', 'http://feeds.chicagotribune.com/chicagotribune/problem/'), + ('Garisson Keillor', 'http://feeds.chicagotribune.com/chicagotribune/keillor/'), + ('Marks Jarvis - On Money', 'http://feeds.chicagotribune.com/chicagotribune/marksjarvisonmoney/'), + ('Sports', 'http://feeds.chicagotribune.com/chicagotribune/sports/'), + ('Arts and Architecture', 'http://feeds.chicagotribune.com/chicagotribune/arts/'), + ('Books', 'http://feeds.chicagotribune.com/chicagotribune/books/'), + ('Magazine', 'http://feeds.chicagotribune.com/chicagotribune/magazine/'), + ('Movies', 'http://feeds.chicagotribune.com/chicagotribune/movies/'), + ('Music', 'http://feeds.chicagotribune.com/chicagotribune/movies/'), + ('TV', 'http://feeds.chicagotribune.com/chicagotribune/tv/'), + ('Hypertext', 'http://feeds.chicagotribune.com/chicagotribune/hypertext/'), + ('iPhone Blog', 'http://feeds.feedburner.com/redeye/iphoneblog'), + ('Julie\'s Health Club', 'http://feeds.chicagotribune.com/chicagotribune_julieshealthclub/'), + ] + + temp_files = [] + + def get_article_url(self, article): + return article.get('feedburner_origlink', article.get('guid', article.get('link'))) + + def get_obfuscated_article(self, url, logger): + with self.obfuctation_lock: + soup = self.index_to_soup(url) + img = soup.find('img', alt='Print') + if img is not None: + a = img.parent.find('a', href=True) + purl = urlparse(url) + xurl = urlunparse(purl[:2] + (a['href'], '', '', '')) + soup = self.index_to_soup(xurl) + for img in soup.findAll('img', src=True): + if img['src'].startswith('/'): + img['src'] = urlunparse(purl[:2]+(img['src'], '', '', '')) + html = unicode(soup) + else: + h1 = soup.find(id='page-title') + body = soup.find(attrs={'class':re.compile('asset-content')}) + html = u'%s'%(unicode(h1)+unicode(body)) + self.temp_files.append(PersistentTemporaryFile('_chicago_tribune.xhtml')) + self.temp_files[-1].write(html.encode('utf-8')) + self.temp_files[-1].close() + return self.temp_files[-1].name +