Update Boston Globe

This commit is contained in:
Kovid Goyal 2017-12-14 14:31:46 +05:30
parent e62a0769c5
commit 9d06256df1
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -21,10 +21,10 @@ class BostonGlobeSubscription(BasicNewsRecipe):
timefmt = ' [%a, %d %b, %Y]' timefmt = ' [%a, %d %b, %Y]'
needs_subscription = 'optional' needs_subscription = 'optional'
keep_only_tags = [ keep_only_tags = [
classes('main-hed lead-figure byline article-text comic'), classes('comic article__title methode__story article-header__headline lead-media figure article-header__byline article-content'),
] ]
remove_tags = [ remove_tags = [
classes('inline-newsletter ad skip-nav'), classes('inline-newsletter ad skip-nav article-footer'),
dict(name=['meta', 'link']) dict(name=['meta', 'link'])
] ]
remove_attributes = ['style'] remove_attributes = ['style']
@@ -66,13 +66,13 @@ class BostonGlobeSubscription(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
def is_login_form(form): def is_login_form(form):
return form.action == "/eom/SysConfig/WebPortal/BostonGlobe/Framework/regi/login2.jsp" return form.action == "https://www.bostonglobe.com/eom/SysConfig/WebPortal/BostonGlobe/Framework/regi/login2.jsp"
# br.set_debug_http(True) # br.set_debug_http(True)
# br.set_debug_responses(True) # br.set_debug_responses(True)
# br.set_debug_redirects(True) # br.set_debug_redirects(True)
br.open( br.open(
"https://www.bostonglobe.com/eom/SysConfig/WebPortal/BostonGlobe/Framework/regi/final-login.jsp") "https://www.bostonglobe.com/login?p1=BGHeader_LogIn")
br.select_form(predicate=is_login_form) br.select_form(predicate=is_login_form)
br["username"] = self.username br["username"] = self.username
br["password"] = self.password br["password"] = self.password
@@ -103,10 +103,12 @@ class BostonGlobeSubscription(BasicNewsRecipe):
self.log("Getting Top Stories") self.log("Getting Top Stories")
articles = [] articles = []
topStoriesDiv = soup.find("div", {"class": "stories-top"}) topStoriesDiv = soup.find("div", {"class": "stories-top"})
stories = topStoriesDiv.findAll("div", {"class": "story"}) stories = topStoriesDiv.findAll("div", {"class": lambda x: x and 'story' in x.split()})
for story in stories: for story in stories:
h2 = story.find("h2", {"class": 'story-title'}) h2 = story.find("h2", {"class": 'story-title'})
link = story.find("a") link = story.find("a", {'class': 'story-perm'})
for img in h2.findAll('img'):
img.extract()
if h2 is not None and link is not None: if h2 is not None and link is not None:
title = self.tag_to_string(h2) title = self.tag_to_string(h2)
url = self.absolutize_url(link["href"]) url = self.absolutize_url(link["href"])