Update BostonGlobe

This commit is contained in:
Kovid Goyal 2014-11-22 10:02:20 +05:30
parent b2d36cfcf0
commit 9a11917eac

View File

@@ -2,6 +2,8 @@ import string, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
from datetime import date
from calibre.utils.magick.draw import save_cover_data_to
from calibre.ptempfile import PersistentTemporaryFile
class BostonGlobeSubscription(BasicNewsRecipe):
@@ -75,6 +77,7 @@ class BostonGlobeSubscription(BasicNewsRecipe):
return url
def get_image(self, url):
# pdb.set_trace()
# Another hack - sometimes the URLs just have a leading /,
# in which case I stick on "http://" and the correct domain
if url.startswith("/"):
@@ -84,15 +87,14 @@ class BostonGlobeSubscription(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser(self)
response = br.open(url)
data = response.get_data()
# write it to a local file whose name is based on the URL
filename = ''.join(c for c in url if c in self.valid_filename_chars)
self.log("filename=%s" % filename)
f = open(filename, "wb")
f.write(data)
f.close()
return url
pt = PersistentTemporaryFile('.jpg')
pt.close()
try:
save_cover_data_to(data, pt.name)
return 'file:///' + pt.name
except:
self.log('Failed to load image: %s' % url)
return ''
def is_login_form(self, form):
    """Report whether ``form`` is the Boston Globe login form.

    The decision is made purely by comparing ``form.action`` with the
    login URL; the result of that equality comparison is returned.
    """
    login_action = "https://www.bostonglobe.com/Login"
    return form.action == login_action
@@ -225,12 +227,12 @@ class BostonGlobeSubscription(BasicNewsRecipe):
# Author is in a P whose class is "hed-cat" or in a CITE
authorTag = story.find("p", "hed-cat")
if (authorTag == None):
if (authorTag is None):
authorTag = story.find("cite")
# URL is in an A whose class is "story-perm". If not, it's in the only A tag
urlTag = story.find("a", "story-perm")
if (urlTag == None):
if (urlTag is None):
urlTag = story.find("a")
# Extract the values and build the article
@@ -253,41 +255,41 @@ class BostonGlobeSubscription(BasicNewsRecipe):
# Now find Letters to the Editor and process them
mainDiv = opinionSoup.find("div", {"id":"main"})
if (mainDiv == None):
if (mainDiv is None):
print "no mainDiv found"
else:
lettersAnchor = mainDiv.find("a", {"href":re.compile(".*opinion/letters.*")})
if (lettersAnchor == None):
if (lettersAnchor is None):
print "No lettersAnchor found"
else:
lettersFeatureWell = lettersAnchor.parent.parent
lettersContent = lettersFeatureWell.find("div", "content")
if (lettersContent == None):
if (lettersContent is None):
print "No lettersContent found"
else:
mainLetterDiv = lettersContent.find("div", {"class":re.compile("main.*")})
mainLetterStories = mainLetterDiv.findAll("h4", "story-title")
if (mainLetterStories == None):
if (mainLetterStories is None):
print "no mainLetterStories found"
else:
for mainLetterStory in mainLetterStories:
mainLetterAnchor = mainLetterStory.parent
if (mainLetterAnchor == None):
if (mainLetterAnchor is None):
print "no mainLetterAnchor found"
else:
articles.append({"title":self.tag_to_string(mainLetterStory), "url":self.make_url(
mainLetterAnchor["href"]), "author":"Letter", "date":self.todaysDate, "description":""})
inSection = lettersFeatureWell.find("div", "in-section")
if (inSection == None):
if (inSection is None):
print "no inSection found"
else:
lettersList = inSection.find("ul")
if (lettersList == None):
if (lettersList is None):
print "no lettersList found"
else:
for letter in lettersList.findAll("li"):
letterAnchor = letter.find("a")
if (letterAnchor == None):
if (letterAnchor is None):
print "no letterAnchor for ", letter
else:
articles.append({"title":self.tag_to_string(letterAnchor), "url":self.make_url(