Updated nytimes top stories recipe

This commit is contained in:
Kovid Goyal 2009-07-13 22:32:56 -06:00
parent ca5960365d
commit ee2fea02b6

View File

@ -7,23 +7,27 @@ nytimes.com
''' '''
import re import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class NYTimes(BasicNewsRecipe): class NYTimes(BasicNewsRecipe):
title = 'NYTimes Top Stories' title = 'New York Times Top Stories'
__author__ = 'Greg Riker' __author__ = 'GRiker'
language = _('English') language = _('English')
description = 'Top Stories from the New York Times' description = 'Top Stories from the New York Times'
#max_articles_per_feed = 3 #max_articles_per_feed = 3
timefmt = '' timefmt = ''
needs_subscription = False needs_subscription = True
remove_tags_before = dict(id='article') remove_tags_after = dict(attrs={'id':['comments']})
remove_tags_after = dict(id='article') remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink', 'clearfix']}), 'clearfix', 'nextArticleLink clearfix','inlineSearchControl',
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']), 'columnGroup','entry-meta','entry-response module','jumpLink','nav',
dict(name=['script', 'noscript', 'style'])] 'columnGroup advertisementColumnGroup']}),
encoding = 'cp1252' dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
'side_search', 'blog_sidebar', 'side_tool', 'side_index', 'login',
'blog-header','searchForm','NYTLogo','insideNYTimes']),
dict(name=['script', 'noscript', 'style','hr'])]
encoding = None
no_stylesheets = True no_stylesheets = True
#extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}' #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
extra_css = '.headline {text-align:left;}\n\ extra_css = '.headline {text-align:left;}\n\
@ -34,6 +38,16 @@ class NYTimes(BasicNewsRecipe):
flatPeriodical = True flatPeriodical = True
def get_browser(self):
    """Return a browser, logged in to nytimes.com when credentials are set.

    The browser is obtained from ``BasicNewsRecipe.get_browser()``. The
    login form is filled in and submitted only when both ``self.username``
    and ``self.password`` are not ``None``.
    """
    browser = BasicNewsRecipe.get_browser()
    if self.username is None or self.password is None:
        # Missing credentials: hand the browser back without logging in.
        return browser

    browser.open('http://www.nytimes.com/auth/login')
    browser.select_form(name='login')
    credentials = (('USERID', self.username), ('PASSWORD', self.password))
    for field, value in credentials:
        browser[field] = value
    browser.submit()
    return browser
def parse_index(self): def parse_index(self):
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/') soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
@ -50,18 +64,21 @@ class NYTimes(BasicNewsRecipe):
else : else :
key = None key = None
sections = { 'topstories' : 'Top Stories', sections = {
'world' : 'World', 'arts' : 'Arts',
'us' : 'U.S.', 'business' : 'Business',
'politics' : 'Politics', 'editorials' : 'Editorials',
'business' : 'Business', 'magazine' : 'Magazine',
'technology' : 'Technology', 'mediaadvertising' : 'Media & Advertising',
'sports' : 'Sports', 'newyorkregion' : 'New York/Region',
'arts' : 'Arts', 'oped' : 'Op-Ed',
'newyorkregion': 'New York/Region', 'politics' : 'Politics',
'travel' : 'Travel', 'sports' : 'Sports',
'editorials' : 'Editorials', 'technology' : 'Technology',
'oped' : 'Op-Ed' 'topstories' : 'Top Stories',
'travel' : 'Travel',
'us' : 'U.S.',
'world' : 'World'
} }
#excludeSectionKeywords = ['World','U.S.', 'Politics','Business','Technology','Sports','Arts','New York','Travel', 'Editorials', 'Op-Ed'] #excludeSectionKeywords = ['World','U.S.', 'Politics','Business','Technology','Sports','Arts','New York','Travel', 'Editorials', 'Op-Ed']
@ -131,6 +148,11 @@ class NYTimes(BasicNewsRecipe):
section = i[i.find('=')+1:-2] section = i[i.find('=')+1:-2]
if self.verbose > 2 : self.log( "sectionTitle: %s" % sections[section]) if self.verbose > 2 : self.log( "sectionTitle: %s" % sections[section])
if not sections.has_key(section) :
self.log( "Unrecognized section id: %s, skipping" % section )
skipThisSection = True
break
# Check for excluded section # Check for excluded section
if len(excludeSectionKeywords): if len(excludeSectionKeywords):
key = sections[section] key = sections[section]
@ -202,26 +224,65 @@ class NYTimes(BasicNewsRecipe):
return ans return ans
def preprocess_html(self, soup):
    """Follow a ``<meta http-equiv="refresh">`` redirect, if the page has one.

    :param soup: parsed article page.
    :return: ``soup`` unchanged when there is no refresh tag or the tag
        carries no usable target; otherwise a new ``BeautifulSoup`` built
        from the redirect target, fetched with ``self.browser`` and decoded
        as cp1252 (undecodable bytes are replaced).
    """
    refresh = soup.find('meta', {'http-equiv':'refresh'})
    if refresh is None:
        return soup

    # The redirect target is whatever follows the first '=' in the content
    # attribute. A missing attribute or a value without '=' gives no target;
    # keep the page we already have instead of crashing or fetching the
    # bare site root.
    content = refresh.get('content')
    target = content.partition('=')[2].strip() if content else ''
    if not target:
        return soup

    # Only site-relative targets get the host prefix; an absolute URL would
    # be corrupted by it.
    if not target.startswith(('http://', 'https://')):
        target = 'http://www.nytimes.com' + target

    raw = self.browser.open(target).read()
    return BeautifulSoup(raw.decode('cp1252', 'replace'))
def postprocess_html(self, soup, first_fetch):
    """Rewrite NYTimes-specific markup into plain heading/emphasis tags.

    Steps, in order:
      * the first ``class="kicker"`` element becomes ``<h3>``
      * every ``class="caption"`` element becomes ``<em>`` followed by ``<hr>``
      * ``<nyt_headline>`` becomes ``<h2 class="headline">``, replacing the
        first ``<h1>``
      * the next remaining ``<h1>`` becomes ``<h3>`` with its link's href removed
      * every ``class="bold"`` element becomes ``<b>``

    Only the first child of each rewritten element is carried over.
    Elements with no children are left untouched.

    :param soup: parsed article page; modified in place.
    :param first_fetch: second positional argument; not used by this method.
        It was previously named ``True``, which shadowed the builtin that
        the ``find(True, ...)`` calls below rely on.
    :return: the same ``soup`` object.
    """
    if self.verbose > 2:
        self.log(" ********** recipe.postprocess_html ********** ")

    # Change class="kicker" to <h3>
    kicker = soup.find(True, {'class':'kicker'})
    if kicker is not None and kicker.contents:
        if self.verbose > 2:
            self.log("changing kicker to <h3>: %s" % kicker)
        h3_tag = Tag(soup, "h3")
        h3_tag.insert(0, kicker.contents[0])
        kicker.replaceWith(h3_tag)

    # Change captions to italic, each followed by a rule
    for caption in soup.findAll(True, {'class':'caption'}):
        if not caption.contents:
            continue
        em_tag = Tag(soup, "em")
        em_tag.insert(0, caption.contents[0])
        em_tag.insert(1, Tag(soup, 'hr'))
        caption.replaceWith(em_tag)

    # Change <nyt_headline> to <h2>
    headline = soup.find("nyt_headline")
    if headline is not None and headline.contents:
        h2_tag = Tag(soup, "h2")
        h2_tag['class'] = "headline"
        h2_tag.insert(0, headline.contents[0])
        # Normally the headline sits inside the first <h1>; when the page has
        # no <h1>, replace the <nyt_headline> element itself.
        enclosing = soup.h1
        if enclosing is None:
            enclosing = headline
        enclosing.replaceWith(h2_tag)

    # Change <h1> to <h3> - used in editorial blogs
    masthead = soup.find("h1")
    if masthead is not None and masthead.contents:
        # Nuke the href
        if masthead.a is not None:
            del masthead.a['href']
        h3_tag = Tag(soup, "h3")
        h3_tag.insert(0, masthead.contents[0])
        masthead.replaceWith(h3_tag)

    # Change <span class="bold"> to <b>
    for subhead in soup.findAll(True, {'class':'bold'}):
        if not subhead.contents:
            continue
        b_tag = Tag(soup, "b")
        b_tag.insert(0, subhead.contents[0])
        subhead.replaceWith(b_tag)

    return soup