From dc97a0c2e136f369f346fcfe2659c255d7550dd8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Jun 2010 17:04:41 -0600 Subject: [PATCH] Updated recipes: Psychology Today, WSJ --- resources/recipes/psych.recipe | 73 +++++++++++++++------------- resources/recipes/wsj.recipe | 89 +++++++++++++++++++++++----------- 2 files changed, 99 insertions(+), 63 deletions(-) diff --git a/resources/recipes/psych.recipe b/resources/recipes/psych.recipe index 46290450cb..7b0b5dcaea 100644 --- a/resources/recipes/psych.recipe +++ b/resources/recipes/psych.recipe @@ -1,39 +1,44 @@ -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup -class PsychologyToday(BasicNewsRecipe): +from calibre.ptempfile import PersistentTemporaryFile +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1275708473(BasicNewsRecipe): title = u'Psychology Today' - language = 'en' - __author__ = 'Krittika Goyal' - oldest_article = 1 #days - max_articles_per_feed = 25 - #encoding = 'latin1' - - remove_stylesheets = True - #remove_tags_before = dict(name='h1', attrs={'class':'heading'}) - #remove_tags_after = dict(name='td', attrs={'class':'newptool1'}) + __author__ = 'rty' + publisher = u'www.psychologytoday.com' + category = u'Psychology' + max_articles_per_feed = 100 + remove_javascript = True + use_embedded_content = False + no_stylesheets = True + language = 'en' + temp_files = [] + articles_are_obfuscated = True remove_tags = [ - dict(name='iframe'), - dict(name='div', attrs={'class':['pt-box-title', 'pt-box-content', 'blog-entry-footer', 'item-list', 'article-sub-meta']}), - dict(name='div', attrs={'id':['block-td_search_160', 'block-cam_search_160']}), - #dict(name='ul', attrs={'class':'article-tools'}), - #dict(name='ul', attrs={'class':'articleTools'}), - ] + dict(name='div', attrs={'class':['print-source_url','field-items','print-footer']}), + dict(name='span', attrs={'class':'print-footnote'}), + ] + 
remove_tags_before = dict(name='h1', attrs={'class':'print-title'}) + remove_tags_after = dict(name='div', attrs={'class':['field-items','print-footer']}) - feeds = [ -('PSY TODAY', - 'http://www.psychologytoday.com/articles/index.rss'), -] + feeds = [(u'Contents', u'http://www.psychologytoday.com/articles/index.rss')] - def preprocess_html(self, soup): - story = soup.find(name='div', attrs={'id':'contentColumn'}) - #td = heading.findParent(name='td') - #td.extract() - soup = BeautifulSoup('t') - body = soup.find(name='body') - body.insert(0, story) - for x in soup.findAll(name='p', text=lambda x:x and '-->' in x): - p = x.findParent('p') - if p is not None: - p.extract() - return soup + def get_article_url(self, article): + return article.get('link', None) + + def get_obfuscated_article(self, url): + br = self.get_browser() + br.open(url) + response = br.follow_link(url_regex = r'/print/[0-9]+', nr = 0) + html = response.read() + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write(html) + self.temp_files[-1].close() + return self.temp_files[-1].name + + def get_cover_url(self): + index = 'http://www.psychologytoday.com/magazine/' + soup = self.index_to_soup(index) + for image in soup.findAll('img',{ "class" : "imagefield imagefield-field_magazine_cover" }): + return image['src'] + '.jpg' + return None diff --git a/resources/recipes/wsj.recipe b/resources/recipes/wsj.recipe index 2e99a690f4..fd5e977d10 100644 --- a/resources/recipes/wsj.recipe +++ b/resources/recipes/wsj.recipe @@ -4,13 +4,14 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' from calibre.web.feeds.news import BasicNewsRecipe +import copy # http://online.wsj.com/page/us_in_todays_paper.html class WallStreetJournal(BasicNewsRecipe): - title = 'The Wall Street Journal (US)' - __author__ = 'Kovid Goyal and Sujata Raman' + title = 'The Wall Street Journal' + __author__ = 'Kovid Goyal, Sujata Raman, and Joshua 
Oster-Morris' description = 'News and current affairs' needs_subscription = True language = 'en' @@ -67,6 +68,16 @@ class WallStreetJournal(BasicNewsRecipe): def wsj_get_index(self): return self.index_to_soup('http://online.wsj.com/itp') + def wsj_add_feed(self,feeds,title,url): + self.log('Found section:', title) + if url.endswith('whatsnews'): + articles = self.wsj_find_wn_articles(url) + else: + articles = self.wsj_find_articles(url) + if articles: + feeds.append((title, articles)) + return feeds + def parse_index(self): soup = self.wsj_get_index() @@ -82,25 +93,62 @@ class WallStreetJournal(BasicNewsRecipe): div = soup.find('div', attrs={'class':'itpHeader'}) div = div.find('ul', attrs={'class':'tab'}) for a in div.findAll('a', href=lambda x: x and '/itp/' in x): - title = self.tag_to_string(a) - url = 'http://online.wsj.com' + a['href'] - self.log('Found section:', title) - articles = self.wsj_find_articles(url) - if articles: - feeds.append((title, articles)) - + pageone = a['href'].endswith('pageone') + if pageone: + title = 'Front Section' + url = 'http://online.wsj.com' + a['href'] + feeds = self.wsj_add_feed(feeds,title,url) + title = "What's News" + url = url.replace('pageone','whatsnews') + feeds = self.wsj_add_feed(feeds,title,url) + else: + title = self.tag_to_string(a) + url = 'http://online.wsj.com' + a['href'] + feeds = self.wsj_add_feed(feeds,title,url) return feeds + def wsj_find_wn_articles(self, url): + soup = self.index_to_soup(url) + articles = [] + + whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x}) + if whats_news is not None: + for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x): + container = a.findParent(['p']) + meta = a.find(attrs={'class':'meta_sectionName'}) + if meta is not None: + meta.extract() + title = self.tag_to_string(a).strip() + url = a['href'] + desc = '' + if container is not None: + desc = self.tag_to_string(container) + + articles.append({'title':title, 
'url':url, + 'description':desc, 'date':''}) + + self.log('\tFound WN article:', title) + + return articles + def wsj_find_articles(self, url): soup = self.index_to_soup(url) - whats_news = soup.find('div', attrs={'class':lambda x: x and - 'whatsNews-simple' in x}) + whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x}) if whats_news is not None: - whats_news.extract() + whats_news.extract() articles = [] + flavorarea = soup.find('div', attrs={'class':lambda x: x and 'ahed' in x}) + if flavorarea is not None: + flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article')) + if flavorstory is not None: + flavorstory['class'] = 'mjLinkItem' + metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x}) + if metapage is not None: + flavorstory.append( copy.copy(metapage) ) #metapage should always be A1 because that should be first on the page + for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True): container = a.findParent(['li', 'div']) meta = a.find(attrs={'class':'meta_sectionName'}) @@ -118,26 +166,9 @@ class WallStreetJournal(BasicNewsRecipe): self.log('\tFound article:', title) - ''' - # Find related articles - a.extract() - for a in container.findAll('a', href=lambda x: x and '/article/' - in x and 'articleTabs' not in x): - url = a['href'] - if not url.startswith('http:'): - url = 'http://online.wsj.com'+url - title = self.tag_to_string(a).strip() - if not title or title.startswith('['): continue - if title: - articles.append({'title':self.tag_to_string(a), - 'url':url, 'description':'', 'date':''}) - self.log('\t\tFound related:', title) - ''' - return articles def cleanup(self): self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com') -