Improved recipe for the Wall Street Journal. Fixes #1048 (Wall Street Journal can no longer be downloaded)

Kovid Goyal 2008-09-25 13:39:44 -07:00
parent ce468602a0
commit 30fc423489
4 changed files with 53 additions and 43 deletions


@@ -138,9 +138,16 @@ def get_proxies():
     return proxies


-def browser(honor_time=False):
+def browser(honor_time=True, max_time=2):
+    '''
+    Create a mechanize browser for web scraping. The browser handles cookies,
+    refresh requests and ignores robots.txt. Also uses a proxy if available.
+
+    :param honor_time: If True, honors pause time in refresh requests
+    :param max_time: Maximum time in seconds to wait during a refresh request
+    '''
     opener = mechanize.Browser()
-    opener.set_handle_refresh(True, honor_time=honor_time)
+    opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
     opener.set_handle_robots(False)
     opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
     http_proxy = get_proxies().get('http', None)
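For reference, a minimal usage sketch of the updated helper, assuming browser() is importable from the top-level calibre package (the file path is not shown in this diff):

# Minimal usage sketch; the import path is an assumption.
from calibre import browser

br = browser()                  # default: honor pauses in refresh requests, waiting at most 2 seconds
br = browser(honor_time=False)  # ignore refresh pause times entirely
html = br.open('http://example.com').read()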


@@ -493,25 +493,27 @@ class BasicNewsRecipe(object, LoggingInterface):
         @return: Path to index.html
         @rtype: string
         '''
-        res = self.build_index()
-        self.cleanup()
-        self.report_progress(1, _('Download finished'))
-        if self.failed_downloads:
-            self.log_warning(_('Failed to download the following articles:'))
-            for feed, article, debug in self.failed_downloads:
-                self.log_warning(article.title+_(' from ')+feed.title)
-                self.log_debug(article.url)
-                self.log_debug(debug)
-        if self.partial_failures:
-            self.log_warning(_('Failed to download parts of the following articles:'))
-            for feed, atitle, aurl, debug in self.partial_failures:
-                self.log_warning(atitle + _(' from ') + feed)
-                self.log_debug(aurl)
-                self.log_warning(_('\tFailed links:'))
-                for l, tb in debug:
-                    self.log_warning(l)
-                    self.log_debug(tb)
-        return res
+        try:
+            res = self.build_index()
+            self.report_progress(1, _('Download finished'))
+            if self.failed_downloads:
+                self.log_warning(_('Failed to download the following articles:'))
+                for feed, article, debug in self.failed_downloads:
+                    self.log_warning(article.title+_(' from ')+feed.title)
+                    self.log_debug(article.url)
+                    self.log_debug(debug)
+            if self.partial_failures:
+                self.log_warning(_('Failed to download parts of the following articles:'))
+                for feed, atitle, aurl, debug in self.partial_failures:
+                    self.log_warning(atitle + _(' from ') + feed)
+                    self.log_debug(aurl)
+                    self.log_warning(_('\tFailed links:'))
+                    for l, tb in debug:
+                        self.log_warning(l)
+                        self.log_debug(tb)
+            return res
+        finally:
+            self.cleanup()

     def feeds2index(self, feeds):
         templ = templates.IndexTemplate()
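The point of the try/finally is that cleanup() now runs even when index building fails partway, so the WSJ recipe's logout (added below) is not skipped after an error. A generic sketch of the pattern, with hypothetical stand-in functions rather than the real recipe methods:

# Sketch of the pattern only; build_index/cleanup here are stand-ins.
def download():
    try:
        res = build_index()   # may raise on network or authentication errors
        return res
    finally:
        cleanup()             # always runs, e.g. to log out of a subscription site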


@@ -4,27 +4,25 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'

 from calibre.web.feeds.news import BasicNewsRecipe
-import re, urlparse
+
+# http://online.wsj.com/page/us_in_todays_paper.html

 class WallStreetJournal(BasicNewsRecipe):

     title = 'The Wall Street Journal'
-    __author__ = 'JTravers'
+    __author__ = 'Kovid Goyal'
     description = 'News and current affairs.'
     needs_subscription = True
     max_articles_per_feed = 10
     timefmt = ' [%a, %b %d, %Y]'

     html2lrf_options = ['--ignore-tables']

-    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-        [
-            ## Remove anything before the body of the article.
-            (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
-
-            ## Remove anything after the end of the article.
-            (r'<!-- article end.*?</body>', lambda match : '</body>'),
-        ]
-    ]
+    remove_tags_before = dict(name='h1')
+    remove_tags = [
+        dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", "articleTabs_tab_interactive"]),
+        {'class':['more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
+    ]
+    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]

     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -36,9 +34,14 @@ class WallStreetJournal(BasicNewsRecipe):
             br.submit()
         return br

-    def print_version(self, url):
-        article = urlparse.urlparse(url).path.rpartition('/')[-1]
-        return 'http://online.wsj.com/article_print/'+article
+    def get_article_url(self, article):
+        try:
+            return article.feedburner_origlink.split('?')[0]
+        except AttributeError:
+            return article.link.split('?')[0]
+
+    def cleanup(self):
+        self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')

     def get_feeds(self):
         return [
@@ -89,7 +92,3 @@ class WallStreetJournal(BasicNewsRecipe):
                 ('Weekend & Leisure - Sports', 'http://online.wsj.com/xml/rss/3_7204.xml'),
                 ]
-
-    ## Logout of website
-    ## NOT CURRENTLY WORKING
-    # def cleanup(self):
-    #     self.browser.open('http://commerce.wsj.com/auth/postlogout')
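The new get_article_url() prefers the feedburner_origlink attribute that feedparser exposes on FeedBurner entries and strips the tracking query string, falling back to the plain link. A rough sketch of the same logic on a hypothetical stand-in article object (illustrative URL, not the real Feed/Article classes):

# Stand-in object and URL for illustration only.
class Article(object):
    feedburner_origlink = 'http://online.wsj.com/article/SB100001.html?mod=rss_whats_news_us'

def article_url(article):
    try:
        return article.feedburner_origlink.split('?')[0]
    except AttributeError:
        return article.link.split('?')[0]

url = article_url(Article())  # 'http://online.wsj.com/article/SB100001.html'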


@@ -118,8 +118,10 @@ class RecursiveFetcher(object, LoggingInterface):
                     tag = tag.parent

         if self.remove_tags_after is not None:
-            tag = soup.find(**self.remove_tags_after)
-            remove_beyond(tag, 'nextSibling')
+            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
+            for spec in rt:
+                tag = soup.find(**spec)
+                remove_beyond(tag, 'nextSibling')

         if self.remove_tags_before is not None:
             tag = soup.find(**self.remove_tags_before)
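With this change remove_tags_after may be either a single spec dict or a list of specs, matching what the WSJ recipe above now passes. A small sketch of the normalization idiom in isolation (normalize_specs is a hypothetical helper, not part of the code base):

# Accept one spec dict or a list of spec dicts, always returning a list.
def normalize_specs(spec_or_specs):
    return [spec_or_specs] if isinstance(spec_or_specs, dict) else spec_or_specs

single = normalize_specs(dict(id='article_story_body'))                               # [{'id': 'article_story_body'}]
many = normalize_specs([dict(id='article_story_body'), {'class': 'article story'}])   # list passed through unchanged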