Fix both NYTimes recipes

Commit d2bdd94ee6 (parent 80079ebc0f), from the calibre repository
(https://github.com/kovidgoyal/calibre.git).
@@ -5,62 +5,59 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import re
import time
from calibre import entity_to_unicode
import re, string, time
from calibre import entity_to_unicode, strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
        Comment, BeautifulStoneSoup
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup

class NYTimes(BasicNewsRecipe):

    title = 'New York Times Top Stories'
    __author__ = 'GRiker'
    language = 'en'
    requires_version = (0, 7, 5)
    description = 'Top Stories from the New York Times'
    # set headlinesOnly to True for the headlines-only version
    headlinesOnly = True

    # List of sections typically included in Top Stories. Use a keyword from the
    # right column in the excludeSectionKeywords[] list to skip downloading that section
    sections = {
         'arts'             : 'Arts',
         'business'         : 'Business',
         'diningwine'       : 'Dining & Wine',
         'editorials'       : 'Editorials',
         'health'           : 'Health',
         'magazine'         : 'Magazine',
         'mediaadvertising' : 'Media & Advertising',
         'newyorkregion'    : 'New York/Region',
         'oped'             : 'Op-Ed',
         'politics'         : 'Politics',
         'science'          : 'Science',
         'sports'           : 'Sports',
         'technology'       : 'Technology',
         'topstories'       : 'Top Stories',
         'travel'           : 'Travel',
         'us'               : 'U.S.',
         'world'            : 'World'
    }

    # includeSections: List of sections to include. If empty, all sections found will be included.
    # Otherwise, only the sections named will be included. For example,
    #
    #    includeSections = ['Politics','Sports']
    #
    # would cause only the Politics and Sports sections to be included.

    # Add section keywords from the right column above to skip that section
    # For example, to skip sections containing the word 'Sports' or 'Dining', use:
    # excludeSectionKeywords = ['Sports', 'Dining']
    # Fetch only Business and Technology
    # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
    # Fetch only Top Stories
    # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
    # By default, no sections are skipped.
    excludeSectionKeywords = []

    includeSections = []  # by default, all sections included

    # excludeSections: List of sections to exclude. If empty, all sections found will be included.
    # Otherwise, the sections named will be excluded. For example,
    #
    #    excludeSections = ['Politics','Sports']
    #
    # would cause the Politics and Sports sections to be excluded. This parameter can be used
    # in conjunction with includeSections although in most cases using one or the other, but
    # not both, is sufficient.

    excludeSections = []

    # one_picture_per_article specifies that calibre should only use the first image
    # from an article (if one exists). If one_picture_per_article = True, the image
    # will be moved to a location between the headline and the byline.
    # If one_picture_per_article = False, all images from the article will be included
    # and shown in their original location.
    one_picture_per_article = True

    # The maximum number of articles that will be downloaded
    max_articles_per_feed = 40
    max_articles_per_feed = 100

    if headlinesOnly:
        title='New York Times Headlines'
        description = 'Headlines from the New York Times'
    else:
        title='New York Times'
        description = 'Today\'s New York Times'

    __author__ = 'GRiker/Kovid Goyal/Nick Redding'
    language = 'en'
    requires_version = (0, 7, 5)

    timefmt = ''
    needs_subscription = True
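The includeSections/excludeSections knobs above only take effect later, when filter_ans() walks the assembled section list. A minimal standalone sketch of that selection rule (the helper name and section names are illustrative, not part of the recipe):

def select_sections(found_sections, include=(), exclude=()):
    # An empty include list means "keep every section that is not excluded",
    # mirroring how filter_ans() treats includeSections/excludeSections.
    kept = []
    for name in found_sections:
        if include and name not in include:
            continue
        if name in exclude:
            continue
        kept.append(name)
    return kept

# select_sections(['Politics','Sports','World'], exclude=('Sports',))
# -> ['Politics', 'World']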
@@ -82,6 +79,7 @@ class NYTimes(BasicNewsRecipe):
            'entry-response module',
            'icon enlargeThis',
            'leftNavTabs',
            'metaFootnote',
            'module box nav',
            'nextArticleLink',
            'nextArticleLink clearfix',
@@ -89,12 +87,13 @@ class NYTimes(BasicNewsRecipe):
            'relatedSearchesModule',
            'side_tool',
            'singleAd',
            'subNavigation clearfix',
            'subNavigation tabContent active',
            'subNavigation tabContent active clearfix',
            re.compile('^subNavigation'),
            re.compile('^leaderboard'),
            re.compile('^module'),
            ]}),
        dict(id=[
            'adxLeaderboard',
            'adxSponLink',
            'archive',
            'articleExtras',
            'articleInline',
@@ -105,87 +104,98 @@ class NYTimes(BasicNewsRecipe):
            'footer',
            'header',
            'header_search',
            'inlineBox',
            'login',
            'masthead',
            'masthead-nav',
            'memberTools',
            'navigation',
            'portfolioInline',
            'readerReviews',
            'readerReviewsCount',
            'relatedArticles',
            'relatedTopics',
            'respond',
            'side_search',
            'side_index',
            'side_tool',
            'toolsRight',
            ]),
        dict(name=['script', 'noscript', 'style'])]

        dict(name=['script', 'noscript', 'style','form','hr'])]
    no_stylesheets = True
    extra_css = '.headline {text-align: left;}\n \
                .byline {font-family: monospace; \
                         text-align: left; \
                         margin-top: 0px; \
                         margin-bottom: 0px;}\n \
                .dateline {font-size: small; \
                           margin-top: 0px; \
                           margin-bottom: 0px;}\n \
                .timestamp {font-size: small; \
                            margin-top: 0px; \
                            margin-bottom: 0px;}\n \
                .source {text-align: left;}\n \
                .image {text-align: center;}\n \
                .credit {text-align: right; \
                         font-size: small; \
                         margin-top: 0px; \
                         margin-bottom: 0px;}\n \
                .articleBody {text-align: left;}\n \
                .authorId {text-align: left; \
                           font-style: italic;}\n '
    extra_css = '''
                .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
                .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .timestamp { text-align: left; font-size: small; }
                .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                a:link {text-decoration: none; }
                .articleBody { }
                .authorId {text-align: left; }
                .image {text-align: center;}
                .source {text-align: left; }'''
    def dump_ans(self, ans) :
    def filter_ans(self, ans) :
        total_article_count = 0
        for section in ans :
        idx = 0
        idx_max = len(ans)-1
        while idx <= idx_max:
            if self.includeSections != []:
                if ans[idx][0] not in self.includeSections:
                    print "SECTION NOT INCLUDED: ",ans[idx][0]
                    del ans[idx]
                    idx_max = idx_max-1
                    continue
            if ans[idx][0] in self.excludeSections:
                print "SECTION EXCLUDED: ",ans[idx][0]
                del ans[idx]
                idx_max = idx_max-1
                continue
            if self.verbose:
                self.log("section %s: %d articles" % (section[0], len(section[1])) )
            for article in section[1]:
                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
            for article in ans[idx][1]:
                total_article_count += 1
                if self.verbose:
                    self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                        article['url'].encode('cp1252','replace')))
            idx = idx+1

        self.log( "Queued %d articles" % total_article_count )
        return ans
    def fixChars(self,string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91","‘",string)
        # Replace rsquo (\x92)
        fixed = re.sub("\x92","’",fixed)
        # Replace ldquo (\x93)
        fixed = re.sub("\x93","“",fixed)
        # Replace rdquo (\x94)
        fixed = re.sub("\x94","”",fixed)
        # Replace ndash (\x96)
        fixed = re.sub("\x96","–",fixed)
        # Replace mdash (\x97)
        fixed = re.sub("\x97","—",fixed)
        return fixed
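fixChars() above repairs stray cp1252 control-range bytes (\x91-\x97) that NYT pages sometimes carry. The same mapping written as a data-driven sketch (standalone, not part of the recipe):

CP1252_PUNCTUATION = {
    u'\x91': u'\u2018',  # left single quote
    u'\x92': u'\u2019',  # right single quote
    u'\x93': u'\u201c',  # left double quote
    u'\x94': u'\u201d',  # right double quote
    u'\x96': u'\u2013',  # en dash
    u'\x97': u'\u2014',  # em dash
}

def fix_chars(text):
    # Equivalent in spirit to the chain of re.sub() calls in fixChars().
    for bad, good in CP1252_PUNCTUATION.items():
        text = text.replace(bad, good)
    return text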
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            try:
                br.open('http://www.nytimes.com/auth/login')
                br.select_form(name='login')
                br['USERID']   = self.username
                br['PASSWORD'] = self.password
                br.submit()
            except:
                self.log("\nFailed to login")
            br.open('http://www.nytimes.com/auth/login')
            br.select_form(name='login')
            br['USERID']   = self.username
            br['PASSWORD'] = self.password
            raw = br.submit().read()
            if 'Please try again' in raw:
                raise Exception('Your username and password are incorrect')
        return br
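Both the old and the new login code above drive the same form; the new version verifies the response body instead of swallowing failures. The flow, condensed into a standalone mechanize sketch (URL, field names and failure text as in the recipe):

import mechanize

def nyt_login(username, password):
    br = mechanize.Browser()
    br.set_handle_robots(False)   # calibre's browser is similarly permissive
    br.open('http://www.nytimes.com/auth/login')
    br.select_form(name='login')
    br['USERID'] = username
    br['PASSWORD'] = password
    raw = br.submit().read()
    # NYT echoes this phrase back on a failed login
    if 'Please try again' in raw:
        raise Exception('Your username and password are incorrect')
    return br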
    def skip_ad_pages(self, soup):
@@ -213,6 +223,9 @@ class NYTimes(BasicNewsRecipe):
            cover = None
        return cover

    def short_title(self):
        return self.title

    def index_to_soup(self, url_or_raw, raw=False):
        '''
        OVERRIDE of class method
@@ -255,157 +268,184 @@ class NYTimes(BasicNewsRecipe):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&#038;' with '&'
            massaged = re.sub("&#038;","&", massaged)
            return self.fixChars(massaged)
        else:
            return description
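massageNCXText() above resolves HTML entities with BeautifulStoneSoup before handing the string to fixChars(). A rough stdlib-only approximation (Python 2; behaviour approximated, not identical):

from HTMLParser import HTMLParser

def massage_ncx_text(description):
    # Rough equivalent: HTMLParser.unescape() resolves both named and
    # numeric entities, including the '&#038;' form handled explicitly above.
    if not description:
        return description
    return HTMLParser().unescape(description)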
    def parse_index(self):
    def parse_todays_index(self):

        def feed_title(div):
            return ''.join(div.findAll(text=True, recursive=True)).strip()

        articles = {}
        key = None
        ans = []
        url_list = []

        def handle_article(div):
            a = div.find('a', href=True)
            if not a:
                return
            url = re.sub(r'\?.*', '', a['href'])
            if not url.startswith("http"):
                return
            if not url.endswith(".html"):
                return
            if 'podcast' in url:
                return
            if '/video/' in url:
                return
            url += '?pagewanted=all'
            if url in url_list:
                return
            url_list.append(url)
            title = self.tag_to_string(a, use_alt=True).strip()
            description = ''
            pubdate = strftime('%a, %d %b')
            summary = div.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            author = ''
            authorAttribution = div.find(True, attrs={'class':'byline'})
            if authorAttribution:
                author = self.tag_to_string(authorAttribution, use_alt=False)
            else:
                authorAttribution = div.find(True, attrs={'class':'byline'})
                if authorAttribution:
                    author = self.tag_to_string(authorAttribution, use_alt=False)
            feed = key if key is not None else 'Uncategorized'
            if not articles.has_key(feed):
                ans.append(feed)
                articles[feed] = []
            articles[feed].append(
                dict(title=title, url=url, date=pubdate,
                    description=description, author=author,
                    content=''))

        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')

        # Find each article
        for div in soup.findAll(True,
            attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

            if div['class'] in ['section-headline','sectionHeader']:
                key = string.capwords(feed_title(div))
                key = key.replace('Op-ed','Op-Ed')
                key = key.replace('U.s.','U.S.')
            elif div['class'] in ['story', 'story headline'] :
                handle_article(div)
            elif div['class'] == 'headlinesOnly multiline flush':
                for lidiv in div.findAll('li'):
                    handle_article(lidiv)

        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        return self.filter_ans(ans)
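handle_article() above accepts a link only after a chain of URL checks; collected into a single predicate, the gate looks like this (sketch; the helper name is illustrative):

import re

def accept_article_url(href, seen_urls):
    # Mirrors handle_article(): strip the query string, keep http .html
    # pages only, skip podcast/video links, and deduplicate.
    url = re.sub(r'\?.*', '', href)
    if not url.startswith('http') or not url.endswith('.html'):
        return None
    if 'podcast' in url or '/video/' in url:
        return None
    url += '?pagewanted=all'
    if url in seen_urls:
        return None
    seen_urls.append(url)
    return url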
    def parse_headline_index(self):

        articles = {}
        ans = []

        feed = key = 'All Top Stories'
        articles[key] = []
        ans.append(key)
        self.log("Scanning 1 section ...")
        url_list = []

        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

        # Fetch the outer table
        table = soup.find('table')
        previousTable = table
        # Fetch the content table
        content_table = soup.find('table',{'id':'content'})
        if content_table is None:
            self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
            return None

        # Find the deepest table containing the stories
        while True :
            table = table.find('table')
            if table.find(text=re.compile('top stories start')) :
                previousTable = table
                continue
            else :
                table = previousTable
                break
        # Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections

        # There are multiple subtables, find the one containing the stories
        for block in table.findAll('table') :
            if block.find(text=re.compile('top stories start')) :
                table = block
                break
            else :
                continue
        for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
            for div_sec in td_col.findAll('div',recursive=False):
                for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
                    section_name = self.tag_to_string(h6_sec_name,use_alt=False)
                    section_name = re.sub(r'^ *$','',section_name)
                    if section_name == '':
                        continue
                    section_name=string.capwords(section_name)
                    if section_name == 'U.s.':
                        section_name = 'U.S.'
                    elif section_name == 'Op-ed':
                        section_name = 'Op-Ed'
                    pubdate = strftime('%a, %d %b')

        # Again there are multiple subtables, find the one containing the stories
        for storyblock in table.findAll('table') :
            if storyblock.find(text=re.compile('top stories start')) :
                break
            else :
                continue

        skipThisSection = False
        todays_article_count = 0
        # Within this table are <font face="times new roman, times, san serif"> entries
        self.log("Fetching feed Top Stories")
        for tr in storyblock.findAllNext('tr'):
            if tr.find('span') is not None :

                sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
                    'times new roman,times, sans serif',
                    'times new roman, times, sans serif']})
                section = None
                bylines = []
                descriptions = []
                pubdate = None

                # Get the Section title
                for (x,i) in enumerate(sectionblock.contents) :
                    skipThisSection = False
                    # Extract the section title
                    if ('Comment' in str(i.__class__)) :
                        if 'start(name=' in i :
                            section = i[i.find('=')+1:-2]

                            if not self.sections.has_key(section) :
                                skipThisSection = True
                    search_div = div_sec
                    for next_tag in h6_sec_name.findNextSiblings(True):
                        if next_tag.__class__.__name__ == 'Tag':
                            if next_tag.name == 'div':
                                search_div = next_tag
                            break

                    # Check for excluded section
                    if len(self.excludeSectionKeywords):
                        key = self.sections[section]
                        excluded = re.compile('|'.join(self.excludeSectionKeywords))
                        if excluded.search(key) or articles.has_key(key):
                            skipThisSection = True
                            break

                # Get the bylines and descriptions
                if not skipThisSection :
                    lines = sectionblock.contents
                    contentStrings = []

                    for line in lines:
                        if not isinstance(line, Comment) and line.strip and line.strip() > "":
                            contentStrings.append(line.strip())

                    # Gather the byline/description pairs
                    bylines = []
                    descriptions = []
                    for contentString in contentStrings:
                        if contentString[0:3] == 'By ' and contentString[3].isupper() :
                            bylines.append(contentString)
                    # Get the articles
                    for h3_item in search_div.findAll('h3'):
                        byline = h3_item.h6
                        if byline is not None:
                            author = self.tag_to_string(byline,use_alt=False)
                        else:
                            descriptions.append(contentString)

                    # Fetch the article titles and URLs
                    articleCount = len(sectionblock.findAll('span'))
                    todays_article_count += articleCount
                    for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
                        a = span.find('a', href=True)
                        author = ''
                        a = h3_item.find('a', href=True)
                        if not a:
                            continue
                        url = re.sub(r'\?.*', '', a['href'])
                        if not url.startswith("http"):
                            continue
                        if not url.endswith(".html"):
                            continue
                        if 'podcast' in url:
                            continue
                        if 'video' in url:
                            continue
                        url += '?pagewanted=all'
                        if url in url_list:
                            continue
                        url_list.append(url)
                        self.log("URL %s" % url)
                        title = self.tag_to_string(a, use_alt=True).strip()
                        desc = h3_item.find('p')
                        if desc is not None:
                            description = self.tag_to_string(desc,use_alt=False)
                        else:
                            description = ''
                        if not articles.has_key(section_name):
                            ans.append(section_name)
                            articles[section_name] = []
                        articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

                        title = self.tag_to_string(a, use_alt=True)
                        # prepend the section name
                        title = self.sections[section] + " · " + title

                        if not isinstance(title, unicode):
                            title = title.decode('utf-8', 'replace')

                        # Allow for unattributed, undescribed entries "Editor's Note"
                        if i >= len(descriptions) :
                            description = None
                        else :
                            description = descriptions[i]

                        if len(bylines) == articleCount :
                            author = bylines[i]
                        else :
                            author = None

                        # Check for duplicates
                        duplicateFound = False
                        if len(articles[feed]) > 1:
                            for article in articles[feed] :
                                if url == article['url'] :
                                    duplicateFound = True
                                    break

                        if duplicateFound:
                            # Continue fetching, don't add this article
                            todays_article_count -= 1
                            continue

                        if not articles.has_key(feed):
                            articles[feed] = []
                        articles[feed].append(
                            dict(title=title, url=url, date=pubdate,
                                description=description, author=author, content=''))
        # self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories"))

        ans = self.sort_index_by(ans, {'Top Stories':-1})
        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        self.dump_ans(ans)
        return ans
        return self.filter_ans(ans)

    def parse_index(self):
        if self.headlinesOnly:
            return self.parse_headline_index()
        else:
            return self.parse_todays_index()
    def strip_anchors(self,soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup
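strip_anchors() unwraps every image-free <a> so articles do not carry dead links into the e-book. A small demonstration with calibre's bundled BeautifulSoup (same API as above; markup invented for illustration):

from calibre.ebooks.BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<p>See <a href="/x">this story</a>.</p>')
for a in soup.findAll('a'):
    if a.img is None:
        a.replaceWith(a.renderContents().decode('cp1252','replace'))
print soup   # -> <p>See this story.</p>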
    def preprocess_html(self, soup):

        kicker_tag = soup.find(attrs={'class':'kicker'})
        if kicker_tag: # remove Op-Ed author head shots
            tagline = self.tag_to_string(kicker_tag)
            if tagline=='Op-Ed Columnist':
                img_div = soup.find('div','inlineImage module')
                if img_div:
                    img_div.extract()
        return self.strip_anchors(soup)

    def postprocess_html(self,soup, True):
@@ -422,8 +462,9 @@ class NYTimes(BasicNewsRecipe):
                firstImg = inlineImgs[0]
                for inlineImg in inlineImgs[1:]:
                    inlineImg.extract()
                # Move firstImg after headline
                cgFirst = soup.find(True, {'class':'columnGroup first'})
                # Move firstImg before article body
                #article_body = soup.find(True, {'id':'articleBody'})
                cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
                if cgFirst:
                    # Strip all sibling NavigableStrings: noise
                    navstrings = cgFirst.findAll(text=True, recursive=False)
@@ -443,30 +484,18 @@ class NYTimes(BasicNewsRecipe):
                    if headline_found:
                        cgFirst.insert(insertLoc,firstImg)
                else:
                    self.log(">>> No class:'columnGroup first' found <<<")
        # Change class="kicker" to <h3>
        kicker = soup.find(True, {'class':'kicker'})
        if kicker and kicker.contents[0]:
            h3Tag = Tag(soup, "h3")
            h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
                use_alt=False)))
            kicker.replaceWith(h3Tag)
                    self.log(">>> No class:'columnGroup first' found <<<")

        # Change captions to italic
        for caption in soup.findAll(True, {'class':'caption'}) :
            if caption and caption.contents[0]:
                emTag = Tag(soup, "em")
                cTag = Tag(soup, "p", [("class", "caption")])
                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                mp_off = c.find("More Photos")
                if mp_off >= 0:
                    c = c[:mp_off]
                emTag.insert(0, c)
                #hrTag = Tag(soup, 'hr')
                #hrTag['class'] = 'caption_divider'
                hrTag = Tag(soup, 'div')
                hrTag['class'] = 'divider'
                emTag.insert(1, hrTag)
                caption.replaceWith(emTag)
                cTag.insert(0, c)
                caption.replaceWith(cTag)

        # Change <nyt_headline> to <h2>
        h1 = soup.find('h1')
@@ -506,17 +535,6 @@ class NYTimes(BasicNewsRecipe):
                bTag.insert(0, subhead.contents[0])
                subhead.replaceWith(bTag)

        # Synthesize a section header
        dsk = soup.find('meta', attrs={'name':'dsk'})
        if dsk and dsk.has_key('content'):
            hTag = Tag(soup,'h3')
            hTag['class'] = 'section'
            hTag.insert(0,NavigableString(dsk['content']))
            articleTag = soup.find(True, attrs={'id':'article'})
            if articleTag:
                articleTag.insert(0,hTag)

        # Add class="articleBody" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'articleBody'})
        if divTag:
            divTag['class'] = divTag['id']
@@ -532,11 +550,3 @@ class NYTimes(BasicNewsRecipe):

        return soup

    def strip_anchors(self,soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup
@@ -5,52 +5,186 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import string, re, time
from calibre import strftime
import re, string, time
from calibre import entity_to_unicode, strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

def decode(self, src):
    enc = 'utf-8'
    if 'iso-8859-1' in src:
        enc = 'cp1252'
    return src.decode(enc, 'ignore')
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup

class NYTimes(BasicNewsRecipe):

    title = u'New York Times'
    __author__ = 'Kovid Goyal/Nick Redding'
    language = 'en'
    requires_version = (0, 6, 36)
    # set headlinesOnly to True for the headlines-only version
    headlinesOnly = False

    description = 'Daily news from the New York Times (subscription version)'
    timefmt = ' [%b %d]'
    # includeSections: List of sections to include. If empty, all sections found will be included.
    # Otherwise, only the sections named will be included. For example,
    #
    #    includeSections = ['Politics','Sports']
    #
    # would cause only the Politics and Sports sections to be included.

    includeSections = []  # by default, all sections included

    # excludeSections: List of sections to exclude. If empty, all sections found will be included.
    # Otherwise, the sections named will be excluded. For example,
    #
    #    excludeSections = ['Politics','Sports']
    #
    # would cause the Politics and Sports sections to be excluded. This parameter can be used
    # in conjunction with includeSections although in most cases using one or the other, but
    # not both, is sufficient.

    excludeSections = []

    # one_picture_per_article specifies that calibre should only use the first image
    # from an article (if one exists). If one_picture_per_article = True, the image
    # will be moved to a location between the headline and the byline.
    # If one_picture_per_article = False, all images from the article will be included
    # and shown in their original location.
    one_picture_per_article = True

    # The maximum number of articles that will be downloaded
    max_articles_per_feed = 100

    if headlinesOnly:
        title='New York Times Headlines'
        description = 'Headlines from the New York Times'
    else:
        title='New York Times'
        description = 'Today\'s New York Times'

    __author__ = 'GRiker/Kovid Goyal/Nick Redding'
    language = 'en'
    requires_version = (0, 7, 5)

    timefmt = ''
    needs_subscription = True
    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
    cover_margins = (18,18,'grey99')
    remove_tags_before = dict(id='article')
    remove_tags_after  = dict(id='article')
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink',
        'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta',
        'icon enlargeThis','columnGroup last','relatedSearchesModule']}),
        dict({'class':re.compile('^subNavigation')}),
        dict({'class':re.compile('^leaderboard')}),
        dict({'class':re.compile('^module')}),
        dict({'class':'metaFootnote'}),
        dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead',
            'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline',
            'side_tool', 'side_index','header','readerReviewsCount','readerReviews',
            'relatedArticles', 'relatedTopics', 'adxSponLink']),
    remove_tags = [dict(attrs={'class':[
            'articleFooter',
            'articleTools',
            'columnGroup doubleRule',
            'columnGroup singleRule',
            'columnGroup last',
            'doubleRule',
            'dottedLine',
            'entry-meta',
            'entry-response module',
            'icon enlargeThis',
            'leftNavTabs',
            'metaFootnote',
            'module box nav',
            'nextArticleLink',
            'nextArticleLink clearfix',
            'post-tools',
            'relatedSearchesModule',
            'side_tool',
            'singleAd',
            re.compile('^subNavigation'),
            re.compile('^leaderboard'),
            re.compile('^module'),
            ]}),
        dict(id=[
            'adxLeaderboard',
            'adxSponLink',
            'archive',
            'articleExtras',
            'articleInline',
            'blog_sidebar',
            'businessSearchBar',
            'cCol',
            'entertainmentSearchBar',
            'footer',
            'header',
            'header_search',
            'inlineBox',
            'login',
            'masthead',
            'masthead-nav',
            'memberTools',
            'navigation',
            'portfolioInline',
            'readerReviews',
            'readerReviewsCount',
            'relatedArticles',
            'relatedTopics',
            'respond',
            'side_search',
            'side_index',
            'side_tool',
            'toolsRight',
            ]),
        dict(name=['script', 'noscript', 'style','form','hr'])]
    encoding = decode
    no_stylesheets = True
    extra_css = '''
                .articleHeadline { margin-top:0.5em; margin-bottom:0.25em; }
                .credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
                .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .timestamp { font-size: small; }
                .caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                a:link {text-decoration: none; }'''
                .timestamp { text-align: left; font-size: small; }
                .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                a:link {text-decoration: none; }
                .articleBody { }
                .authorId {text-align: left; }
                .image {text-align: center;}
                .source {text-align: left; }'''
    def filter_ans(self, ans) :
        total_article_count = 0
        idx = 0
        idx_max = len(ans)-1
        while idx <= idx_max:
            if self.includeSections != []:
                if ans[idx][0] not in self.includeSections:
                    print "SECTION NOT INCLUDED: ",ans[idx][0]
                    del ans[idx]
                    idx_max = idx_max-1
                    continue
            if ans[idx][0] in self.excludeSections:
                print "SECTION EXCLUDED: ",ans[idx][0]
                del ans[idx]
                idx_max = idx_max-1
                continue
            if self.verbose:
                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
            for article in ans[idx][1]:
                total_article_count += 1
                if self.verbose:
                    self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                        article['url'].encode('cp1252','replace')))
            idx = idx+1

        self.log( "Queued %d articles" % total_article_count )
        return ans
    def fixChars(self,string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91","‘",string)
        # Replace rsquo (\x92)
        fixed = re.sub("\x92","’",fixed)
        # Replace ldquo (\x93)
        fixed = re.sub("\x93","“",fixed)
        # Replace rdquo (\x94)
        fixed = re.sub("\x94","”",fixed)
        # Replace ndash (\x96)
        fixed = re.sub("\x96","–",fixed)
        # Replace mdash (\x97)
        fixed = re.sub("\x97","—",fixed)
        return fixed
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
@@ -60,22 +194,19 @@ class NYTimes(BasicNewsRecipe):
            br['USERID']   = self.username
            br['PASSWORD'] = self.password
            raw = br.submit().read()
            if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
            if 'Please try again' in raw:
                raise Exception('Your username and password are incorrect')
            #open('/t/log.html', 'wb').write(raw)
        return br

    def get_masthead_url(self):
        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
        #masthead = 'http://members.cox.net/nickredding/nytlogo.gif'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(masthead)
        except:
            self.log("\nMasthead unavailable")
            masthead = None
        return masthead
    def skip_ad_pages(self, soup):
        # Skip ad pages served before actual article
        skip_tag = soup.find(True, {'name':'skip'})
        if skip_tag is not None:
            self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
            url += '?pagewanted=all'
            self.log.warn("Skipping ad to article at '%s'" % url)
            return self.index_to_soup(url, raw=True)
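skip_ad_pages() looks for the marker anchor that NYT wraps around its interstitial ad pages and re-fetches the real article instead; preprocess_html() further down handles the meta-refresh variant of the same redirect. The URL extraction in isolation (standalone sketch, helper name illustrative):

import re

def forwarding_url(soup):
    # Returns the real article URL if `soup` is an ad interstitial, else None.
    skip_tag = soup.find(True, {'name':'skip'})
    if skip_tag is None:
        return None
    url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
    return url + '?pagewanted=all'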
    def get_cover_url(self):
        cover = None
@@ -93,12 +224,57 @@ class NYTimes(BasicNewsRecipe):
        return cover

    def short_title(self):
        return 'New York Times'
        return self.title

    def parse_index(self):
        self.encoding = 'cp1252'
        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
        self.encoding = decode
    def index_to_soup(self, url_or_raw, raw=False):
        '''
        OVERRIDE of class method
        deals with various page encodings between index and articles
        '''
        def get_the_soup(docEncoding, url_or_raw, raw=False) :
            if re.match(r'\w+://', url_or_raw):
                f = self.browser.open(url_or_raw)
                _raw = f.read()
                f.close()
                if not _raw:
                    raise RuntimeError('Could not fetch index from %s'%url_or_raw)
            else:
                _raw = url_or_raw
            if raw:
                return _raw

            if not isinstance(_raw, unicode) and self.encoding:
                _raw = _raw.decode(docEncoding, 'replace')
            massage = list(BeautifulSoup.MARKUP_MASSAGE)
            massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
            return BeautifulSoup(_raw, markupMassage=massage)

        # Entry point
        print "index_to_soup()"
        soup = get_the_soup( self.encoding, url_or_raw )
        contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
        docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
        if docEncoding == '' :
            docEncoding = self.encoding

        if self.verbose > 2:
            self.log( "       document encoding: '%s'" % docEncoding)
        if docEncoding != self.encoding :
            soup = get_the_soup(docEncoding, url_or_raw)

        return soup
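The override above parses the page once with the assumed encoding, reads the charset from the <meta http-equiv="Content-Type"> tag, and re-parses when the two disagree. The charset extraction reduces to this (sketch; the recipe does the same with find()/rfind() slicing):

import re

def charset_from_meta(content_type_tag, fallback):
    # str(tag) looks like:
    # <meta content="text/html; charset=utf-8" http-equiv="Content-Type" />
    m = re.search(r'charset=([^"]+)', str(content_type_tag))
    return m.group(1) if m else fallback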
    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&#038;' with '&'
            massaged = re.sub("&#038;","&", massaged)
            return self.fixChars(massaged)
        else:
            return description
    def parse_todays_index(self):

        def feed_title(div):
            return ''.join(div.findAll(text=True, recursive=True)).strip()
@@ -119,12 +295,13 @@ class NYTimes(BasicNewsRecipe):
                return
            if 'podcast' in url:
                return
            if '/video/' in url:
                return
            url += '?pagewanted=all'
            if url in url_list:
                return
            url_list.append(url)
            title = self.tag_to_string(a, use_alt=True).strip()
            #self.log("Title: %s" % title)
            description = ''
            pubdate = strftime('%a, %d %b')
            summary = div.find(True, attrs={'class':'summary'})
@@ -140,6 +317,7 @@ class NYTimes(BasicNewsRecipe):
                    author = self.tag_to_string(authorAttribution, use_alt=False)
            feed = key if key is not None else 'Uncategorized'
            if not articles.has_key(feed):
                ans.append(feed)
                articles[feed] = []
            articles[feed].append(
                dict(title=title, url=url, date=pubdate,
@@ -147,46 +325,228 @@ class NYTimes(BasicNewsRecipe):
                    content=''))

        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')

        # Find each instance of class="section-headline", class="story", class="story headline"

        # Find each article
        for div in soup.findAll(True,
            attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

            if div['class'] in ['section-headline','sectionHeader']:
                key = string.capwords(feed_title(div))
                articles[key] = []
                ans.append(key)
                #self.log('Section: %s' % key)

                key = key.replace('Op-ed','Op-Ed')
                key = key.replace('U.s.','U.S.')
            elif div['class'] in ['story', 'story headline'] :
                handle_article(div)
            elif div['class'] == 'headlinesOnly multiline flush':
                for lidiv in div.findAll('li'):
                    handle_article(lidiv)

        # ans = self.sort_index_by(ans, {'The Front Page':-1,
        #                                'Dining In, Dining Out':1,
        #                                'Obituaries':2})
        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        return self.filter_ans(ans)
    def parse_headline_index(self):

        articles = {}
        ans = []
        url_list = []

        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

        # Fetch the content table
        content_table = soup.find('table',{'id':'content'})
        if content_table is None:
            self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
            return None

        # Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections

        for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
            for div_sec in td_col.findAll('div',recursive=False):
                for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
                    section_name = self.tag_to_string(h6_sec_name,use_alt=False)
                    section_name = re.sub(r'^ *$','',section_name)
                    if section_name == '':
                        continue
                    section_name=string.capwords(section_name)
                    if section_name == 'U.s.':
                        section_name = 'U.S.'
                    elif section_name == 'Op-ed':
                        section_name = 'Op-Ed'
                    pubdate = strftime('%a, %d %b')

                    search_div = div_sec
                    for next_tag in h6_sec_name.findNextSiblings(True):
                        if next_tag.__class__.__name__ == 'Tag':
                            if next_tag.name == 'div':
                                search_div = next_tag
                            break

                    # Get the articles
                    for h3_item in search_div.findAll('h3'):
                        byline = h3_item.h6
                        if byline is not None:
                            author = self.tag_to_string(byline,use_alt=False)
                        else:
                            author = ''
                        a = h3_item.find('a', href=True)
                        if not a:
                            continue
                        url = re.sub(r'\?.*', '', a['href'])
                        if not url.startswith("http"):
                            continue
                        if not url.endswith(".html"):
                            continue
                        if 'podcast' in url:
                            continue
                        if 'video' in url:
                            continue
                        url += '?pagewanted=all'
                        if url in url_list:
                            continue
                        url_list.append(url)
                        self.log("URL %s" % url)
                        title = self.tag_to_string(a, use_alt=True).strip()
                        desc = h3_item.find('p')
                        if desc is not None:
                            description = self.tag_to_string(desc,use_alt=False)
                        else:
                            description = ''
                        if not articles.has_key(section_name):
                            ans.append(section_name)
                            articles[section_name] = []
                        articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        return self.filter_ans(ans)
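The traversal above mirrors the markup of the Today's Headlines e-mail page: each <td id="...Column..."> holds <div> blocks whose uppercase <h6> names a section, and the following <div> carries one <h3> per story. A toy fragment shows the shape (BeautifulSoup 3 style, as used here; markup invented for illustration):

import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup

html = '''<table id="content"><tr><td id="leftColumn">
<div><h6 style="text-transform: uppercase">World</h6></div>
<div><h3><a href="http://www.nytimes.com/a.html">Story</a></h3></div>
</td></tr></table>'''

soup = BeautifulSoup(html)
content_table = soup.find('table', {'id':'content'})
for td_col in content_table.findAll('td', {'id':re.compile('Column')}):
    for div_sec in td_col.findAll('div', recursive=False):
        for h6 in div_sec.findAll('h6', {'style':re.compile('text-transform: *uppercase')}):
            print h6.string   # -> World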
    def parse_index(self):
        if self.headlinesOnly:
            return self.parse_headline_index()
        else:
            return self.parse_todays_index()

    def strip_anchors(self,soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup

        return ans
    def preprocess_html(self, soup):

        kicker_tag = soup.find(attrs={'class':'kicker'})
        if kicker_tag:
        if kicker_tag: # remove Op-Ed author head shots
            tagline = self.tag_to_string(kicker_tag)
            #self.log("FOUND KICKER %s" % tagline)
            if tagline=='Op-Ed Columnist':
                img_div = soup.find('div','inlineImage module')
                #self.log("Searching for photo")
                if img_div:
                    img_div.extract()
                    #self.log("Photo deleted")
        refresh = soup.find('meta', {'http-equiv':'refresh'})
        if refresh is None:
            return soup
        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open_novisit('http://www.nytimes.com'+content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))
        return self.strip_anchors(soup)

    def postprocess_html(self,soup, True):

        if self.one_picture_per_article:
            # Remove all images after first
            largeImg = soup.find(True, {'class':'articleSpanImage'})
            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
            if largeImg:
                for inlineImg in inlineImgs:
                    inlineImg.extract()
            else:
                if inlineImgs:
                    firstImg = inlineImgs[0]
                    for inlineImg in inlineImgs[1:]:
                        inlineImg.extract()
                    # Move firstImg before article body
                    #article_body = soup.find(True, {'id':'articleBody'})
                    cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
                    if cgFirst:
                        # Strip all sibling NavigableStrings: noise
                        navstrings = cgFirst.findAll(text=True, recursive=False)
                        [ns.extract() for ns in navstrings]
                        headline_found = False
                        tag = cgFirst.find(True)
                        insertLoc = 0
                        while True:
                            insertLoc += 1
                            if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
                                headline_found = True
                                break
                            tag = tag.nextSibling
                            if not tag:
                                headline_found = False
                                break
                        if headline_found:
                            cgFirst.insert(insertLoc,firstImg)
                    else:
                        self.log(">>> No class:'columnGroup first' found <<<")

        # Change captions to italic
        for caption in soup.findAll(True, {'class':'caption'}) :
            if caption and caption.contents[0]:
                cTag = Tag(soup, "p", [("class", "caption")])
                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                mp_off = c.find("More Photos")
                if mp_off >= 0:
                    c = c[:mp_off]
                cTag.insert(0, c)
                caption.replaceWith(cTag)

        # Change <nyt_headline> to <h2>
        h1 = soup.find('h1')
        if h1:
            headline = h1.find("nyt_headline")
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.contents[0]))
                h1.replaceWith(tag)
        else:
            # Blog entry - replace headline, remove <hr> tags
            headline = soup.find('title')
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.contents[0]))
                soup.insert(0, tag)
                hrs = soup.findAll('hr')
                for hr in hrs:
                    hr.extract()

        # Change <h1> to <h3> - used in editorial blogs
        masthead = soup.find("h1")
        if masthead:
            # Nuke the href
            if masthead.a:
                del(masthead.a['href'])
            tag = Tag(soup, "h3")
            tag.insert(0, self.fixChars(masthead.contents[0]))
            masthead.replaceWith(tag)

        # Change <span class="bold"> to <b>
        for subhead in soup.findAll(True, {'class':'bold'}) :
            if subhead.contents:
                bTag = Tag(soup, "b")
                bTag.insert(0, subhead.contents[0])
                subhead.replaceWith(bTag)

        divTag = soup.find('div',attrs={'id':'articleBody'})
        if divTag:
            divTag['class'] = divTag['id']

        # Add class="authorId" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'authorId'})
        if divTag and divTag.contents[0]:
            tag = Tag(soup, "p")
            tag['class'] = "authorId"
            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                use_alt=False)))
            divTag.replaceWith(tag)

        return soup
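When one_picture_per_article is set, the branch at the top of postprocess_html() keeps only the first inline image and later re-seats it above the article body. The pruning step in isolation (sketch; `soup` is a parsed article document):

def prune_inline_images(soup):
    # Keep the first 'inlineImage module' block, drop the rest, and hand
    # the survivor back so the caller can re-insert it near the headline.
    inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
    for inlineImg in inlineImgs[1:]:
        inlineImg.extract()
    return inlineImgs[0] if inlineImgs else None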