mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #4009 (NYT Top Stories fails)
This commit is contained in:
parent
339df810b8
commit
8e004db71b
@ -6,6 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
|||||||
nytimes.com
|
nytimes.com
|
||||||
'''
|
'''
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
from calibre import entity_to_unicode
|
from calibre import entity_to_unicode
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
|
||||||
@ -14,7 +15,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
title = 'New York Times Top Stories'
|
title = 'New York Times Top Stories'
|
||||||
__author__ = 'GRiker'
|
__author__ = 'GRiker'
|
||||||
language = 'en'
|
language = _('English')
|
||||||
description = 'Top Stories from the New York Times'
|
description = 'Top Stories from the New York Times'
|
||||||
|
|
||||||
# List of sections typically included in Top Stories. Use a keyword from the
|
# List of sections typically included in Top Stories. Use a keyword from the
|
||||||
@ -79,6 +80,14 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
.authorId {text-align: left; \
|
.authorId {text-align: left; \
|
||||||
font-style: italic;}\n '
|
font-style: italic;}\n '
|
||||||
|
|
||||||
|
# def get_cover_url(self):
|
||||||
|
# st = time.localtime()
|
||||||
|
# year = str(st.tm_year)
|
||||||
|
# month = "%.2d" % st.tm_mon
|
||||||
|
# day = "%.2d" % st.tm_mday
|
||||||
|
# cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/' + 'scan.jpg'
|
||||||
|
# return cover
|
||||||
|
|
||||||
def get_browser(self):
|
def get_browser(self):
|
||||||
br = BasicNewsRecipe.get_browser()
|
br = BasicNewsRecipe.get_browser()
|
||||||
if self.username is not None and self.password is not None:
|
if self.username is not None and self.password is not None:
|
||||||
@ -202,21 +211,25 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
# Get the bylines and descriptions
|
# Get the bylines and descriptions
|
||||||
if not skipThisSection :
|
if not skipThisSection :
|
||||||
for (x,i) in enumerate(sectionblock.contents) :
|
lines = sectionblock.contents
|
||||||
|
contentStrings = []
|
||||||
|
|
||||||
# Extract the bylines and descriptions
|
for line in lines:
|
||||||
if (i.string is not None) and \
|
if not isinstance(line, Comment) and line.strip and line.strip() > "":
|
||||||
(i.string.strip() > "") and \
|
contentStrings.append(line.strip())
|
||||||
not isinstance(i,Comment):
|
|
||||||
contentString = i.strip().encode('utf-8')
|
# Gather the byline/description pairs
|
||||||
if contentString[0:3] == 'By ' and contentString[4].isupper() :
|
bylines = []
|
||||||
|
descriptions = []
|
||||||
|
for contentString in contentStrings:
|
||||||
|
if contentString[0:3] == 'By ' and contentString[3].isupper() :
|
||||||
bylines.append(contentString)
|
bylines.append(contentString)
|
||||||
else:
|
else:
|
||||||
descriptions.append(contentString)
|
descriptions.append(contentString)
|
||||||
|
|
||||||
# Fetch the article titles and URLs
|
# Fetch the article titles and URLs
|
||||||
articleCount = len(sectionblock.findAll('span'))
|
articleCount = len(sectionblock.findAll('span'))
|
||||||
for (i,span) in enumerate(sectionblock.findAll('span')) :
|
for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
|
||||||
a = span.find('a', href=True)
|
a = span.find('a', href=True)
|
||||||
url = re.sub(r'\?.*', '', a['href'])
|
url = re.sub(r'\?.*', '', a['href'])
|
||||||
url += '?pagewanted=all'
|
url += '?pagewanted=all'
|
||||||
@ -228,6 +241,10 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
if not isinstance(title, unicode):
|
if not isinstance(title, unicode):
|
||||||
title = title.decode('utf-8', 'replace')
|
title = title.decode('utf-8', 'replace')
|
||||||
|
|
||||||
|
# Allow for unattributed, undescribed entries "Editor's Note"
|
||||||
|
if i >= len(descriptions) :
|
||||||
|
description = None
|
||||||
|
else :
|
||||||
description = descriptions[i]
|
description = descriptions[i]
|
||||||
|
|
||||||
if len(bylines) == articleCount :
|
if len(bylines) == articleCount :
|
||||||
@ -291,7 +308,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
return soup
|
return soup
|
||||||
else:
|
else:
|
||||||
print "no allowed content found, removing article"
|
print "no allowed content found, removing article"
|
||||||
raise Exception()
|
raise StringError
|
||||||
|
|
||||||
def postprocess_html(self,soup, True):
|
def postprocess_html(self,soup, True):
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user