Update Boston Globe

parent 25de618ae1
commit e0dfef216f
@@ -1,7 +1,7 @@
 import string, re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Tag
-from datetime import date
+from datetime import date, timedelta
 from calibre.utils.magick.draw import save_cover_data_to
 from calibre.ptempfile import PersistentTemporaryFile
 
@@ -14,21 +14,19 @@ class BostonGlobeSubscription(BasicNewsRecipe):
     __author__ = 'Rob Freundlich'
     description = 'Boston Globe with full articles for subscribers'
     language = 'en'
-    INDEX = date.today().strftime('http://www.bostonglobe.com/todayspaper/%Y/%m/%d')
+    INDEX = 'http://www.bostonglobe.com/todayspaper/%Y/%m/%d'
     todaysDate = date.today().strftime("%d/%m/%Y")
     timefmt = ' [%a, %d %b, %Y]'
     needs_subscription = 'optional'
-    remove_tags = [dict(attrs={"class":["skip-nav article-more",
-                                        "aside promo",
-                                        "article-bar bar-yellow",
-                                        "tools",
-                                        "sticky-tools",
-                                        "article-footer",
-                                        "bg-footer"]}),
-                   dict(attrs={"id":["masthead",
-                                     "video",
-                                     "section-nav",
-                                     "meter-limit-met-popup"]})]
+    keep_only_tags = [
+        dict(attrs={'class':['section-head', 'comic', 'article']})
+    ]
+    remove_tags = [
+        dict(attrs={"class":[
+            "skip-nav article-more", "aside promo", "article-bar bar-yellow", "tools", "sticky-tools", "article-footer", "bg-footer"
+        ]}),
+        dict(attrs={"id":["masthead", "video", "section-nav", 'newsletter-form', "meter-limit-met-popup"]})
+    ]
     no_stylesheets = True
     # simultaneous_downloads = 1
     valid_filename_chars = "-_.%s%s" % (string.ascii_letters, string.digits)
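
Note on the INDEX change above: the old value ran date.today().strftime(...) once, at class-definition time, so a long-lived process would keep fetching the page for whatever day the recipe was first imported. Storing the bare template and formatting it at fetch time defers the date choice to each run. A minimal sketch of the difference (illustrative names only):

    from datetime import date

    # Evaluated once at import time; the date is frozen into the URL.
    FROZEN = date.today().strftime('http://www.bostonglobe.com/todayspaper/%Y/%m/%d')

    # Template only; formatted when the fetch actually happens, so it
    # always reflects the current (or fallback) date.
    TEMPLATE = 'http://www.bostonglobe.com/todayspaper/%Y/%m/%d'
    url = date.today().strftime(TEMPLATE)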
@@ -126,8 +124,9 @@ class BostonGlobeSubscription(BasicNewsRecipe):
     def make_url(self, url):
         if url.startswith("//"):
             return "http:" + url
-        return "http://www.bostonglobe.com" + url
+        if url.startswith('/'):
+            url = "http://www.bostonglobe.com" + url
+        return url
 
     def make_bostoncom_url(self, url):
         if url.startswith("//"):
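
The reworked make_url above now prefixes only site-relative paths; the old version prepended the domain to anything that did not start with "//", which mangled URLs that were already absolute. A standalone sketch of the intended behavior:

    def make_url(url):
        # Protocol-relative links get an explicit scheme.
        if url.startswith("//"):
            return "http:" + url
        # Site-relative paths get the Globe's domain prepended.
        if url.startswith('/'):
            url = "http://www.bostonglobe.com" + url
        # Absolute URLs now fall through untouched.
        return url

    assert make_url("//example.com/a") == "http://example.com/a"
    assert make_url("/metro/story") == "http://www.bostonglobe.com/metro/story"
    assert make_url("http://www.boston.com/x") == "http://www.boston.com/x"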
@@ -138,173 +137,81 @@ class BostonGlobeSubscription(BasicNewsRecipe):
     def parse_index(self):
         # self.logger.setLevel(logging.WARNING)
         feeds = []
-        self.log("Getting today's paper from ", self.INDEX)
-        soup = self.index_to_soup(self.INDEX)
+        try:
+            index = date.today().strftime(self.INDEX)
+            self.log("Getting today's paper from ", index)
+            soup = self.index_to_soup(index)
+        except Exception:
+            self.todaysDate = (date.today() - timedelta(days=1))
+            index = self.todaysDate.strftime(self.INDEX)
+            self.log("Getting today's paper from ", index)
+            soup = self.index_to_soup(index)
+
+        def title_from_h2(h2):
+            [img.extract() for img in h2.findAll('img')]
+            return self.tag_to_string(h2)
 
         def get_top_stories():
-            self.log("Getting top stories")
+            self.log("Getting Top Stories")
             articles = []
-            topStoriesDiv = soup.find("div", {"class":re.compile(".*stories-top.*")})
-            title = ""
-            url = ""
-            excerpt = ""
-
-            stories = topStoriesDiv.findAll("div", {"class":re.compile("story.*")})
+            topStoriesDiv = soup.find("div", {"class":"stories-top"})
+            stories = topStoriesDiv.findAll("div", {"class":"story"})
 
             for story in stories:
-                title = self.tag_to_string(story.find("h2", {"class":re.compile(".*story-title.*")}))
+                h2 = story.find("h2", {"class":'story-title'})
                 link = story.find("a")
-                if (link):
-                    url = link["href"]
+                if h2 is not None and link is not None:
+                    title = title_from_h2(h2)
+                    url = self.make_url(link["href"])
                     excerpt_div = story.find("div", {"class":"excerpt"})
                     excerpt = self.tag_to_string(excerpt_div)
+                    self.log('\t', title, '[%s]' % url)
+                    self.log('\t\t', excerpt)
                     articles.append({"title":title, "url":self.make_url(url), "date":self.todaysDate, "description":excerpt})
-                else:
-                    self.log("Skipping ", title, " because it has no link")
 
             if articles:
                 feeds.append(("Top Stories", articles))
 
-        def get_section(sectionTitle, sectionID):
-            self.log("Getting section", sectionTitle)
+        def get_section(sectionDiv):
+            sectionHeader = sectionDiv.find("h2", "hed-section")
             articles = []
-            sectionDiv = soup.find(id=sectionID)
-            if (sectionDiv):
-                sectionHeader = sectionDiv.find("h2", "hed-section")
-                feedTitle = self.tag_to_string(sectionHeader)
-                excerpts = sectionDiv.findAll("div", "sec-excerpt")
-                for excerpt in excerpts:
-                    url = ""
-                    title = ""
-                    category = ""
-                    author = ""
-
-                    # Stories here follow similar forms to top-stories (above)
-                    storyTitle = excerpt.find("h3", "story-title")
-                    if (storyTitle.parent.name == "a"):
-                        a = storyTitle.parent
-                        url = a["href"]
-                        title = self.tag_to_string(storyTitle)
-                    else:
-                        a = storyTitle.find("a")
-                        url = a["href"]
-                        title = self.tag_to_string(a)
-
-                    hedCat = excerpt.find("p", "hed-cat")
-                    if (hedCat):
-                        category = self.tag_to_string(hedCat)
-
-                    authorHeader = excerpt.find("h4", "author")
-                    if (authorHeader):
-                        author = self.tag_to_string(authorHeader)
-
-                    if (category != "") & (category != " "):
-                        title = category + ": " + title
-
-                    description = ""
-                    for para in excerpt.findAll("p"):
-                        if (para != hedCat):
-                            description += self.tag_to_string(para)
-
-                    articles.append({"title":title, "url":self.make_url(url), "author":author, "date":self.todaysDate, "description":description})
-
-            if articles:
-                feeds.append((feedTitle, articles))
+            feedTitle = self.tag_to_string(sectionHeader)
+            self.log("Getting", feedTitle)
+            excerpts = sectionDiv.findAll("div", "sec-excerpt")
+            for excerpt in excerpts:
+                # Stories here follow similar forms to top-stories (above)
+                storyTitle = excerpt.find("h3", "story-title")
+                if (storyTitle.parent.name == "a"):
+                    a = storyTitle.parent
+                    url = a["href"]
+                    title = title_from_h2(storyTitle)
+                else:
+                    a = storyTitle.find("a")
+                    url = a["href"]
+                    title = title_from_h2(a)
+
+                hedCat = excerpt.find("p", "hed-cat")
+                if (hedCat):
+                    category = self.tag_to_string(hedCat)
+
+                authorHeader = excerpt.find("h4", "author")
+                if (authorHeader):
+                    author = self.tag_to_string(authorHeader)
+
+                if (category != "") & (category != " "):
+                    title = category + ": " + title
+
+                description = ""
+                for para in excerpt.findAll("p"):
+                    if (para != hedCat):
+                        description += self.tag_to_string(para)
+
+                self.log('\t', title, '[%s]' % self.make_url(url))
+                if description:
+                    self.log('\t\t', description)
+                articles.append({"title":title, "url":self.make_url(url), "author":author, "date":self.todaysDate, "description":description})
+
+            if articles:
+                feeds.append((feedTitle, articles))
 
-        def getOpinionSection():
-            self.log("Getting section", "Editorials and Opinions")
-            articles = []
-            opinionSoup = self.index_to_soup("http://www.bostonglobe.com/opinion")
-
-            # Find and process the editorials
-            topStories = opinionSoup.find("div", "stories-top")
-            for story in topStories.findAll("div", {"class":re.compile("story.*")}):
-                # Story title is always in an H2 tag whose class contains "story-title"
-                titleTag = story.find("h2", {"class":re.compile("story-title.*")})
-
-                # Description is always in a DIV whose class is "excerpt"
-                excerptTag = story.find("div", "excerpt")
-
-                # Author is in a P whose class is "hed-cat" or in a CITE
-                authorTag = story.find("p", "hed-cat")
-                if (authorTag is None):
-                    authorTag = story.find("cite")
-
-                # URL is in an A whose class is "story-perm". If not, it's in the only A tag
-                urlTag = story.find("a", "story-perm")
-                if (urlTag is None):
-                    urlTag = story.find("a")
-
-                # Extract the values and build the article
-                title = ""
-                if (titleTag):
-                    title = self.tag_to_string(titleTag)
-
-                author = ""
-                if (authorTag):
-                    author = self.tag_to_string(authorTag)
-
-                description = ""
-                if (excerptTag):
-                    description = self.tag_to_string(excerptTag)
-
-                url = ""
-                if (urlTag):
-                    url = urlTag["href"]
-                articles.append({"title":title, "url":self.make_url(url), "author":author, "date":self.todaysDate, "description":description})
-
-            # Now find Letters to the Editor and process them
-            mainDiv = opinionSoup.find("div", {"id":"main"})
-            if (mainDiv is None):
-                print "no mainDiv found"
-            else:
-                lettersAnchor = mainDiv.find("a", {"href":re.compile(".*opinion/letters.*")})
-                if (lettersAnchor is None):
-                    print "No lettersAnchor found"
-                else:
-                    lettersFeatureWell = lettersAnchor.parent.parent
-                    lettersContent = lettersFeatureWell.find("div", "content")
-                    if (lettersContent is None):
-                        print "No lettersContent found"
-                    else:
-                        mainLetterDiv = lettersContent.find("div", {"class":re.compile("main.*")})
-                        mainLetterStories = mainLetterDiv.findAll("h4", "story-title")
-                        if (mainLetterStories is None):
-                            print "no mainLetterStories found"
-                        else:
-                            for mainLetterStory in mainLetterStories:
-                                mainLetterAnchor = mainLetterStory.parent
-                                if (mainLetterAnchor is None):
-                                    print "no mainLetterAnchor found"
-                                else:
-                                    articles.append({"title":self.tag_to_string(mainLetterStory), "url":self.make_url(
-                                        mainLetterAnchor["href"]), "author":"Letter", "date":self.todaysDate, "description":""})
-                    inSection = lettersFeatureWell.find("div", "in-section")
-                    if (inSection is None):
-                        print "no inSection found"
-                    else:
-                        lettersList = inSection.find("ul")
-                        if (lettersList is None):
-                            print "no lettersList found"
-                        else:
-                            for letter in lettersList.findAll("li"):
-                                letterAnchor = letter.find("a")
-                                if (letterAnchor is None):
-                                    print "no letterAnchor for ", letter
-                                else:
-                                    articles.append({"title":self.tag_to_string(letterAnchor), "url":self.make_url(
-                                        letterAnchor["href"]), "author":"Letter", "date":self.todaysDate, "description":""})
-
-            if articles:
-                # for article in articles:
-                #     print "============"
-                #     print "TITLE", article["title"]
-                #     print "URL", article["url"]
-                #     print "AUTHOR", article["author"]
-                #     print "DATE", article["date"]
-                #     print "DESCRIPTION", article["description"]
-                #     print "============"
-                feeds.append(("Editorial and Opinion", articles))
 
         def get_comics():
             articles = []
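
The try/except added to parse_index is what the new timedelta import supports: if today's index page cannot be fetched (the edition may not be up yet), the recipe retries with yesterday's date and records it in self.todaysDate. The pattern, reduced to a standalone sketch (fetch is a stand-in for index_to_soup):

    from datetime import date, timedelta

    def fetch_todays_paper(fetch, template='http://www.bostonglobe.com/todayspaper/%Y/%m/%d'):
        try:
            # Try today's edition first.
            return fetch(date.today().strftime(template))
        except Exception:
            # Fall back to yesterday's paper if today's index is not up.
            yesterday = date.today() - timedelta(days=1)
            return fetch(yesterday.strftime(template))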
@@ -326,40 +233,11 @@ class BostonGlobeSubscription(BasicNewsRecipe):
 
         get_top_stories()
 
-        get_section("Nation", "tp-section-thenation")
-        get_section("World", "tp-section-theworld")
-        get_section("Metro", "tp-section-metro")
-        getOpinionSection()
-        get_section("Arts & Movies", "tp-section-g:arts&movies")
-        get_section("Family", "tp-section-g:family")
-        get_section("Style", "tp-section-g:style")
-        get_section("Globe West", "tp-section-globewest")
-        get_section("Food", "tp-section-g:food")
-        get_section("Living", "tp-section-living")
-        get_section("Health", "tp-section-g:health")
-        get_section("Ideas", "tp-section-ideas")
-        get_section("Boston Globe Magazine", "tp-section-magazine")
+        for div in soup.findAll('div', {'class':'tod-paper-section'}):
+            get_section(div)
 
         get_comics()
 
-        # get_section("Business", "tp-section-business")
-        # get_section("Obituaries", "tp-section-obituaries")
-        # get_section("Sports", "tp-section-sports")
-        # get_section("Globe North", "tp-section-globenorth")
-        # get_section("Globe South", "tp-section-globesouth")
-        # get_section("Money & Careers", "tp-section-money&careers")
-        # get_section("Books", "tp-section-books")
-        # get_section("Travel", "tp-section-travel")
-        # get_section("Real Estate", "tp-section-realestate")
-
-        for feed in feeds:
-            feedTitle = feed[0]
-            articles = feed[1]
-            self.log(feedTitle)
-            for article in articles:
-                self.log(" ", article["title"], ".....", article["url"])
-                self.log(" ", article["description"])
-
         return feeds
 
     def postprocess_comics(self, soup, first):
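
The hunk above swaps the hard-coded section list for discovery: instead of naming every section ID ("tp-section-thenation" and friends), the recipe walks each section container the page actually serves, so sections the Globe adds, renames, or drops no longer require a recipe edit. In sketch form (names as in the recipe):

    # Old approach: one call per known section ID; new sections meant editing
    # the recipe, and stale IDs silently returned nothing.
    #   get_section("Nation", "tp-section-thenation")
    #   get_section("World", "tp-section-theworld")
    #   ...
    # New approach: let the page enumerate its own sections.
    for div in soup.findAll('div', {'class': 'tod-paper-section'}):
        get_section(div)  # feed title is read from the div's "hed-section" header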
@@ -370,11 +248,6 @@ class BostonGlobeSubscription(BasicNewsRecipe):
         imgLink = main.find("a", "comic")
         img = imgLink.img
 
-        print "title: %s" % title
-        print "byline: %s" % byline
-        print "imgLink: %s" % imgLink
-        print "img: %s" % img
-
         body = Tag(soup, "body")
         body.insert(0, title)
         body.insert(1, byline)
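
For context on the surrounding code: postprocess_comics keeps only a comic's title, byline, and image by building a fresh body tag with calibre's bundled (BeautifulSoup 3 style) API. A sketch of the pattern; the final replaceWith step is an assumption, since it falls outside the hunk:

    from calibre.ebooks.BeautifulSoup import Tag

    # Create an empty <body> in the same soup and attach only the pieces
    # worth keeping, in reading order.
    body = Tag(soup, "body")
    body.insert(0, title)    # headline
    body.insert(1, byline)   # credit line
    body.insert(2, imgLink)  # the comic image, wrapped in its link

    # Assumed continuation: swap the rebuilt body in for the original.
    soup.body.replaceWith(body)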