mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Updated nytimes top stories recipe
This commit is contained in:
parent
ca5960365d
commit
ee2fea02b6
@ -7,23 +7,27 @@ nytimes.com
|
|||||||
'''
|
'''
|
||||||
import re
|
import re
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import Tag
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
|
||||||
|
|
||||||
class NYTimes(BasicNewsRecipe):
|
class NYTimes(BasicNewsRecipe):
|
||||||
|
|
||||||
title = 'NYTimes Top Stories'
|
title = 'New York Times Top Stories'
|
||||||
__author__ = 'Greg Riker'
|
__author__ = 'GRiker'
|
||||||
language = _('English')
|
language = _('English')
|
||||||
description = 'Top Stories from the New York Times'
|
description = 'Top Stories from the New York Times'
|
||||||
#max_articles_per_feed = 3
|
#max_articles_per_feed = 3
|
||||||
timefmt = ''
|
timefmt = ''
|
||||||
needs_subscription = False
|
needs_subscription = True
|
||||||
remove_tags_before = dict(id='article')
|
remove_tags_after = dict(attrs={'id':['comments']})
|
||||||
remove_tags_after = dict(id='article')
|
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
|
||||||
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink', 'clearfix']}),
|
'clearfix', 'nextArticleLink clearfix','inlineSearchControl',
|
||||||
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
|
'columnGroup','entry-meta','entry-response module','jumpLink','nav',
|
||||||
dict(name=['script', 'noscript', 'style'])]
|
'columnGroup advertisementColumnGroup']}),
|
||||||
encoding = 'cp1252'
|
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
|
||||||
|
'side_search', 'blog_sidebar', 'side_tool', 'side_index', 'login',
|
||||||
|
'blog-header','searchForm','NYTLogo','insideNYTimes']),
|
||||||
|
dict(name=['script', 'noscript', 'style','hr'])]
|
||||||
|
encoding = None
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
#extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
|
#extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
|
||||||
extra_css = '.headline {text-align:left;}\n\
|
extra_css = '.headline {text-align:left;}\n\
|
||||||
@ -34,6 +38,16 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
flatPeriodical = True
|
flatPeriodical = True
|
||||||
|
|
||||||
|
def get_browser(self):
    '''
    Return the browser used for fetching, logged in when possible.

    The browser comes from ``BasicNewsRecipe.get_browser()``. When both
    ``self.username`` and ``self.password`` are set, the form named
    ``login`` on the NYTimes login page is filled in and submitted before
    the browser is returned; otherwise the browser is returned untouched.
    '''
    browser = BasicNewsRecipe.get_browser()
    # Without a complete set of credentials there is nothing to submit.
    if self.username is None or self.password is None:
        return browser
    browser.open('http://www.nytimes.com/auth/login')
    browser.select_form(name='login')
    browser['USERID'] = self.username
    browser['PASSWORD'] = self.password
    browser.submit()
    return browser
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
|
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
|
||||||
|
|
||||||
@ -50,18 +64,21 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
else :
|
else :
|
||||||
key = None
|
key = None
|
||||||
|
|
||||||
sections = { 'topstories' : 'Top Stories',
|
sections = {
|
||||||
'world' : 'World',
|
'arts' : 'Arts',
|
||||||
'us' : 'U.S.',
|
'business' : 'Business',
|
||||||
'politics' : 'Politics',
|
'editorials' : 'Editorials',
|
||||||
'business' : 'Business',
|
'magazine' : 'Magazine',
|
||||||
'technology' : 'Technology',
|
'mediaadvertising' : 'Media & Advertising',
|
||||||
'sports' : 'Sports',
|
'newyorkregion' : 'New York/Region',
|
||||||
'arts' : 'Arts',
|
'oped' : 'Op-Ed',
|
||||||
'newyorkregion': 'New York/Region',
|
'politics' : 'Politics',
|
||||||
'travel' : 'Travel',
|
'sports' : 'Sports',
|
||||||
'editorials' : 'Editorials',
|
'technology' : 'Technology',
|
||||||
'oped' : 'Op-Ed'
|
'topstories' : 'Top Stories',
|
||||||
|
'travel' : 'Travel',
|
||||||
|
'us' : 'U.S.',
|
||||||
|
'world' : 'World'
|
||||||
}
|
}
|
||||||
|
|
||||||
#excludeSectionKeywords = ['World','U.S.', 'Politics','Business','Technology','Sports','Arts','New York','Travel', 'Editorials', 'Op-Ed']
|
#excludeSectionKeywords = ['World','U.S.', 'Politics','Business','Technology','Sports','Arts','New York','Travel', 'Editorials', 'Op-Ed']
|
||||||
@ -131,6 +148,11 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
section = i[i.find('=')+1:-2]
|
section = i[i.find('=')+1:-2]
|
||||||
if self.verbose > 2 : self.log( "sectionTitle: %s" % sections[section])
|
if self.verbose > 2 : self.log( "sectionTitle: %s" % sections[section])
|
||||||
|
|
||||||
|
if not sections.has_key(section) :
|
||||||
|
self.log( "Unrecognized section id: %s, skipping" % section )
|
||||||
|
skipThisSection = True
|
||||||
|
break
|
||||||
|
|
||||||
# Check for excluded section
|
# Check for excluded section
|
||||||
if len(excludeSectionKeywords):
|
if len(excludeSectionKeywords):
|
||||||
key = sections[section]
|
key = sections[section]
|
||||||
@ -202,26 +224,65 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
    '''
    Follow a ``<meta http-equiv="refresh">`` redirect if the page has one.

    :param soup: Parsed article page.
    :return: ``soup`` itself when it has no refresh tag, or when the tag
             carries no usable target; otherwise a new ``BeautifulSoup``
             built from the page fetched from the refresh target with
             ``self.browser``.
    '''
    refresh = soup.find('meta', {'http-equiv':'refresh'})
    if refresh is None:
        return soup
    # The target is whatever follows the first '=' in the content attribute
    # (conventionally "<delay>;url=<target>"), so any '=' inside the
    # target's own query string is preserved.
    content = refresh.get('content') or ''
    target = content.partition('=')[2].strip().strip('\'"')
    if not target:
        # Missing content attribute or no target in it: nothing to follow.
        return soup
    if not target.lower().startswith(('http://', 'https://')):
        # Site-relative target; an absolute URL is opened as given.
        target = 'http://www.nytimes.com' + target
    raw = self.browser.open(target).read()
    # Bytes that are not valid cp1252 are replaced instead of raising.
    return BeautifulSoup(raw.decode('cp1252', 'replace'))
|
||||||
|
|
||||||
def postprocess_html(self, soup, first_fetch):
    '''
    Rewrite NYTimes-specific markup into plain heading and emphasis tags.

    In order: the first ``class="kicker"`` element becomes ``<h3>``, every
    ``class="caption"`` element becomes ``<em>`` followed by ``<hr>``,
    ``<nyt_headline>`` becomes ``<h2 class="headline">`` (replacing the
    first ``<h1>``), the first remaining ``<h1>`` becomes ``<h3>`` with
    its link's ``href`` removed, and every ``class="bold"`` element becomes
    ``<b>``. Only the first child of each rewritten element is carried
    over.

    :param soup: Parsed article; modified in place.
    :param first_fetch: Not used by this method. It was previously named
                        ``True``, which shadowed the builtin inside the
                        body and is a syntax error on Python 3.
    :return: The same ``soup`` object.
    '''
    def retag(source, name, target=None):
        # Wrap the first child of `source` in a new <name> tag and put that
        # tag in place of `target` (or of `source` itself when `target` is
        # None). An empty `source` is left alone, since there is nothing to
        # carry over, and None is returned.
        if not source.contents:
            return None
        replacement = Tag(soup, name)
        replacement.insert(0, source.contents[0])
        if target is None:
            target = source
        target.replaceWith(replacement)
        return replacement

    if self.verbose > 2 : self.log(" ********** recipe.postprocess_html ********** ")

    # Change class="kicker" to <h3>
    kicker = soup.find(True, {'class':'kicker'})
    if kicker is not None:
        if self.verbose > 2 : self.log("changing kicker to <h3>")
        retag(kicker, "h3")

    # Change captions to italic, followed by a rule
    for caption in soup.findAll(True, {'class':'caption'}):
        em_tag = retag(caption, "em")
        if em_tag is not None:
            em_tag.insert(1, Tag(soup, 'hr'))

    # Change <nyt_headline> to <h2>. The new tag takes the place of the
    # first <h1>; when the page has no <h1>, it replaces the headline
    # element itself instead of failing.
    headline = soup.find("nyt_headline")
    if headline is not None:
        h2_tag = retag(headline, "h2", soup.h1)
        if h2_tag is not None:
            h2_tag['class'] = "headline"

    # Change <h1> to <h3> - used in editorial blogs
    masthead = soup.find("h1")
    if masthead is not None:
        # Nuke the href
        if masthead.a is not None:
            del masthead.a['href']
        retag(masthead, "h3")

    # Change <span class="bold"> to <b>
    for subhead in soup.findAll(True, {'class':'bold'}):
        retag(subhead, "b")

    return soup
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user