Update Boston Globe

This commit is contained in:
Kovid Goyal 2015-10-15 09:48:48 +05:30
parent 25de618ae1
commit e0dfef216f

View File

@ -1,7 +1,7 @@
import string, re import string, re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag from calibre.ebooks.BeautifulSoup import Tag
from datetime import date from datetime import date, timedelta
from calibre.utils.magick.draw import save_cover_data_to from calibre.utils.magick.draw import save_cover_data_to
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
@ -14,21 +14,19 @@ class BostonGlobeSubscription(BasicNewsRecipe):
__author__ = 'Rob Freundlich' __author__ = 'Rob Freundlich'
description = 'Boston Globe with full articles for subscribers' description = 'Boston Globe with full articles for subscribers'
language = 'en' language = 'en'
INDEX = date.today().strftime('http://www.bostonglobe.com/todayspaper/%Y/%m/%d') INDEX = 'http://www.bostonglobe.com/todayspaper/%Y/%m/%d'
todaysDate = date.today().strftime("%d/%m/%Y") todaysDate = date.today().strftime("%d/%m/%Y")
timefmt = ' [%a, %d %b, %Y]' timefmt = ' [%a, %d %b, %Y]'
needs_subscription = 'optional' needs_subscription = 'optional'
remove_tags = [dict(attrs={"class":["skip-nav article-more", keep_only_tags = [
"aside promo", dict(attrs={'class':['section-head', 'comic', 'article']})
"article-bar bar-yellow", ]
"tools", remove_tags = [
"sticky-tools", dict(attrs={"class":[
"article-footer", "skip-nav article-more", "aside promo", "article-bar bar-yellow", "tools", "sticky-tools", "article-footer", "bg-footer"
"bg-footer"]}), ]}),
dict(attrs={"id":["masthead", dict(attrs={"id":["masthead", "video", "section-nav", 'newsletter-form', "meter-limit-met-popup"]})
"video", ]
"section-nav",
"meter-limit-met-popup"]})]
no_stylesheets = True no_stylesheets = True
# simultaneous_downloads = 1 # simultaneous_downloads = 1
valid_filename_chars = "-_.%s%s" % (string.ascii_letters, string.digits) valid_filename_chars = "-_.%s%s" % (string.ascii_letters, string.digits)
@ -126,8 +124,9 @@ class BostonGlobeSubscription(BasicNewsRecipe):
def make_url(self, url): def make_url(self, url):
if url.startswith("//"): if url.startswith("//"):
return "http:" + url return "http:" + url
if url.startswith('/'):
return "http://www.bostonglobe.com" + url url = "http://www.bostonglobe.com" + url
return url
def make_bostoncom_url(self, url): def make_bostoncom_url(self, url):
if url.startswith("//"): if url.startswith("//"):
@ -138,57 +137,57 @@ class BostonGlobeSubscription(BasicNewsRecipe):
def parse_index(self): def parse_index(self):
# self.logger.setLevel(logging.WARNING) # self.logger.setLevel(logging.WARNING)
feeds = [] feeds = []
self.log("Getting today's paper from ", self.INDEX) try:
soup = self.index_to_soup(self.INDEX) index = date.today().strftime(self.INDEX)
self.log("Getting today's paper from ", index)
soup = self.index_to_soup(index)
except Exception:
self.todaysDate = (date.today() - timedelta(days=1))
index = self.todaysDate.strftime(self.INDEX)
self.log("Getting today's paper from ", index)
soup = self.index_to_soup(index)
def title_from_h2(h2):
[img.extract() for img in h2.findAll('img')]
return self.tag_to_string(h2)
def get_top_stories(): def get_top_stories():
self.log("Getting top stories") self.log("Getting Top Stories")
articles = [] articles = []
topStoriesDiv = soup.find("div", {"class":re.compile(".*stories-top.*")}) topStoriesDiv = soup.find("div", {"class":"stories-top"})
stories = topStoriesDiv.findAll("div", {"class":"story"})
title = ""
url = ""
excerpt = ""
stories = topStoriesDiv.findAll("div", {"class":re.compile("story.*")})
for story in stories: for story in stories:
title = self.tag_to_string(story.find("h2", {"class":re.compile(".*story-title.*")})) h2 = story.find("h2", {"class":'story-title'})
link = story.find("a") link = story.find("a")
if (link): if h2 is not None and link is not None:
url = link["href"] title = title_from_h2(h2)
url = self.make_url(link["href"])
excerpt_div = story.find("div", {"class":"excerpt"}) excerpt_div = story.find("div", {"class":"excerpt"})
excerpt = self.tag_to_string(excerpt_div) excerpt = self.tag_to_string(excerpt_div)
self.log('\t', title, '[%s]' % url)
self.log('\t\t', excerpt)
articles.append({"title":title, "url":self.make_url(url), "date":self.todaysDate, "description":excerpt}) articles.append({"title":title, "url":self.make_url(url), "date":self.todaysDate, "description":excerpt})
else:
self.log("Skipping ", title, " because it has no link")
if articles: if articles:
feeds.append(("Top Stories", articles)) feeds.append(("Top Stories", articles))
def get_section(sectionTitle, sectionID): def get_section(sectionDiv):
self.log("Getting section", sectionTitle)
articles = []
sectionDiv = soup.find(id=sectionID)
if (sectionDiv):
sectionHeader = sectionDiv.find("h2", "hed-section") sectionHeader = sectionDiv.find("h2", "hed-section")
articles = []
feedTitle = self.tag_to_string(sectionHeader) feedTitle = self.tag_to_string(sectionHeader)
self.log("Getting", feedTitle)
excerpts = sectionDiv.findAll("div", "sec-excerpt") excerpts = sectionDiv.findAll("div", "sec-excerpt")
for excerpt in excerpts: for excerpt in excerpts:
url = ""
title = ""
category = ""
author = ""
# Stories here follow similar forms to top-stories (above) # Stories here follow similar forms to top-stories (above)
storyTitle = excerpt.find("h3", "story-title") storyTitle = excerpt.find("h3", "story-title")
if (storyTitle.parent.name == "a"): if (storyTitle.parent.name == "a"):
a = storyTitle.parent a = storyTitle.parent
url = a["href"] url = a["href"]
title = self.tag_to_string(storyTitle) title = title_from_h2(storyTitle)
else: else:
a = storyTitle.find("a") a = storyTitle.find("a")
url = a["href"] url = a["href"]
title = self.tag_to_string(a) title = title_from_h2(a)
hedCat = excerpt.find("p", "hed-cat") hedCat = excerpt.find("p", "hed-cat")
if (hedCat): if (hedCat):
@ -206,106 +205,14 @@ class BostonGlobeSubscription(BasicNewsRecipe):
if (para != hedCat): if (para != hedCat):
description += self.tag_to_string(para) description += self.tag_to_string(para)
self.log('\t', title, '[%s]' % self.make_url(url))
if description:
self.log('\t\t', description)
articles.append({"title":title, "url":self.make_url(url), "author":author, "date":self.todaysDate, "description":description}) articles.append({"title":title, "url":self.make_url(url), "author":author, "date":self.todaysDate, "description":description})
if articles: if articles:
feeds.append((feedTitle, articles)) feeds.append((feedTitle, articles))
def getOpinionSection():
self.log("Getting section", "Editorials and Opinions")
articles = []
opinionSoup = self.index_to_soup("http://www.bostonglobe.com/opinion")
# Find and process the editorials
topStories = opinionSoup.find("div", "stories-top")
for story in topStories.findAll("div", {"class":re.compile("story.*")}):
# Story title is always in an H2 tag whose class contains "story-title"
titleTag = story.find("h2", {"class":re.compile("story-title.*")})
# Description is always in a DIV whose class is "excerpt"
excerptTag = story.find("div", "excerpt")
# Author is in a P whose class is "hed-cat" or in a CITE
authorTag = story.find("p", "hed-cat")
if (authorTag is None):
authorTag = story.find("cite")
# URL is in an A whose class is "story-perm". If not, it's in the only A tag
urlTag = story.find("a", "story-perm")
if (urlTag is None):
urlTag = story.find("a")
# Extract the values and build the article
title = ""
if (titleTag):
title = self.tag_to_string(titleTag)
author = ""
if (authorTag):
author = self.tag_to_string(authorTag)
description = ""
if (excerptTag):
description = self.tag_to_string(excerptTag)
url = ""
if (urlTag):
url = urlTag["href"]
articles.append({"title":title, "url":self.make_url(url), "author":author, "date":self.todaysDate, "description":description})
# Now find Letters to the Editor and process them
mainDiv = opinionSoup.find("div", {"id":"main"})
if (mainDiv is None):
print "no mainDiv found"
else:
lettersAnchor = mainDiv.find("a", {"href":re.compile(".*opinion/letters.*")})
if (lettersAnchor is None):
print "No lettersAnchor found"
else:
lettersFeatureWell = lettersAnchor.parent.parent
lettersContent = lettersFeatureWell.find("div", "content")
if (lettersContent is None):
print "No lettersContent found"
else:
mainLetterDiv = lettersContent.find("div", {"class":re.compile("main.*")})
mainLetterStories = mainLetterDiv.findAll("h4", "story-title")
if (mainLetterStories is None):
print "no mainLetterStories found"
else:
for mainLetterStory in mainLetterStories:
mainLetterAnchor = mainLetterStory.parent
if (mainLetterAnchor is None):
print "no mainLetterAnchor found"
else:
articles.append({"title":self.tag_to_string(mainLetterStory), "url":self.make_url(
mainLetterAnchor["href"]), "author":"Letter", "date":self.todaysDate, "description":""})
inSection = lettersFeatureWell.find("div", "in-section")
if (inSection is None):
print "no inSection found"
else:
lettersList = inSection.find("ul")
if (lettersList is None):
print "no lettersList found"
else:
for letter in lettersList.findAll("li"):
letterAnchor = letter.find("a")
if (letterAnchor is None):
print "no letterAnchor for ", letter
else:
articles.append({"title":self.tag_to_string(letterAnchor), "url":self.make_url(
letterAnchor["href"]), "author":"Letter", "date":self.todaysDate, "description":""})
if articles:
# for article in articles:
# print "============"
# print "TITLE", article["title"]
# print "URL", article["url"]
# print "AUTHOR", article["author"]
# print "DATE", article["date"]
# print "DESCRIPTION", article["description"]
# print "============"
feeds.append(("Editorial and Opinion", articles))
def get_comics(): def get_comics():
articles = [] articles = []
comicSoup = self.index_to_soup("http://www.bostonglobe.com/lifestyle/comics") comicSoup = self.index_to_soup("http://www.bostonglobe.com/lifestyle/comics")
@ -326,40 +233,11 @@ class BostonGlobeSubscription(BasicNewsRecipe):
get_top_stories() get_top_stories()
get_section("Nation", "tp-section-thenation") for div in soup.findAll('div', {'class':'tod-paper-section'}):
get_section("World", "tp-section-theworld") get_section(div)
get_section("Metro", "tp-section-metro")
getOpinionSection()
get_section("Arts & Movies", "tp-section-g:arts&movies")
get_section("Family", "tp-section-g:family")
get_section("Style", "tp-section-g:style")
get_section("Globe West", "tp-section-globewest")
get_section("Food", "tp-section-g:food")
get_section("Living", "tp-section-living")
get_section("Health", "tp-section-g:health")
get_section("Ideas", "tp-section-ideas")
get_section("Boston Globe Magazine", "tp-section-magazine")
get_comics() get_comics()
# get_section("Business", "tp-section-business")
# get_section("Obituaries", "tp-section-obituaries")
# get_section("Sports", "tp-section-sports")
# get_section("Globe North", "tp-section-globenorth")
# get_section("Globe South", "tp-section-globesouth")
# get_section("Money & Careers", "tp-section-money&careers")
# get_section("Books", "tp-section-books")
# get_section("Travel", "tp-section-travel")
# get_section("Real Estate", "tp-section-realestate")
for feed in feeds:
feedTitle = feed[0]
articles = feed[1]
self.log(feedTitle)
for article in articles:
self.log(" ", article["title"], ".....", article["url"])
self.log(" ", article["description"])
return feeds return feeds
def postprocess_comics(self, soup, first): def postprocess_comics(self, soup, first):
@ -370,11 +248,6 @@ class BostonGlobeSubscription(BasicNewsRecipe):
imgLink = main.find("a", "comic") imgLink = main.find("a", "comic")
img = imgLink.img img = imgLink.img
print "title: %s" % title
print "byline: %s" % byline
print "imgLink: %s" % imgLink
print "img: %s" % img
body = Tag(soup, "body") body = Tag(soup, "body")
body.insert(0, title) body.insert(0, title)
body.insert(1, byline) body.insert(1, byline)