diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index 152c58502f..afc74fbdef 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -138,9 +138,16 @@ def get_proxies():
     return proxies
 
 
-def browser(honor_time=False):
+def browser(honor_time=True, max_time=2):
+    '''
+    Create a mechanize browser for web scraping. The browser handles cookies,
+    refresh requests and ignores robots.txt. It also uses a proxy if available.
+
+    :param honor_time: If True, honors the pause time in refresh requests
+    :param max_time: Maximum time in seconds to wait during a refresh request
+    '''
     opener = mechanize.Browser()
-    opener.set_handle_refresh(True, honor_time=honor_time)
+    opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
     opener.set_handle_robots(False)
     opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
     http_proxy = get_proxies().get('http', None)
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 32b7ee2562..bac117d628 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -493,25 +493,27 @@ class BasicNewsRecipe(object, LoggingInterface):
         @return: Path to index.html
         @rtype: string
         '''
-        res = self.build_index()
-        self.cleanup()
-        self.report_progress(1, _('Download finished'))
-        if self.failed_downloads:
-            self.log_warning(_('Failed to download the following articles:'))
-            for feed, article, debug in self.failed_downloads:
-                self.log_warning(article.title+_(' from ')+feed.title)
-                self.log_debug(article.url)
-                self.log_debug(debug)
-        if self.partial_failures:
-            self.log_warning(_('Failed to download parts of the following articles:'))
-            for feed, atitle, aurl, debug in self.partial_failures:
-                self.log_warning(atitle + _(' from ') + feed)
-                self.log_debug(aurl)
-                self.log_warning(_('\tFailed links:'))
-                for l, tb in debug:
-                    self.log_warning(l)
-                    self.log_debug(tb)
-        return res
+        try:
+            res = self.build_index()
+            self.report_progress(1, _('Download finished'))
+            if self.failed_downloads:
+                self.log_warning(_('Failed to download the following articles:'))
+                for feed, article, debug in self.failed_downloads:
+                    self.log_warning(article.title+_(' from ')+feed.title)
+                    self.log_debug(article.url)
+                    self.log_debug(debug)
+            if self.partial_failures:
+                self.log_warning(_('Failed to download parts of the following articles:'))
+                for feed, atitle, aurl, debug in self.partial_failures:
+                    self.log_warning(atitle + _(' from ') + feed)
+                    self.log_debug(aurl)
+                    self.log_warning(_('\tFailed links:'))
+                    for l, tb in debug:
+                        self.log_warning(l)
+                        self.log_debug(tb)
+            return res
+        finally:
+            self.cleanup()
 
     def feeds2index(self, feeds):
         templ = templates.IndexTemplate()
diff --git a/src/calibre/web/feeds/recipes/wsj.py b/src/calibre/web/feeds/recipes/wsj.py
index b6ab4f6f1a..ec3bc6bb93 100644
--- a/src/calibre/web/feeds/recipes/wsj.py
+++ b/src/calibre/web/feeds/recipes/wsj.py
@@ -4,28 +4,26 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 
 from calibre.web.feeds.news import BasicNewsRecipe
-import re, urlparse
+
+# http://online.wsj.com/page/us_in_todays_paper.html
 
 class WallStreetJournal(BasicNewsRecipe):
     title = 'The Wall Street Journal'
-    __author__ = 'JTravers'
+    __author__ = 'Kovid Goyal'
     description = 'News and current affairs.'
     needs_subscription = True
     max_articles_per_feed = 10
     timefmt = ' [%a, %b %d, %Y]'
     html2lrf_options = ['--ignore-tables']
+    remove_tags_before = dict(name='h1')
+    remove_tags = [
+        dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", "articleTabs_tab_interactive"]),
+        {'class':['more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
+    ]
+    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]
 
-    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-        [
-            ## Remove anything before the body of the article.
-            (r'
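
For reference, a minimal usage sketch of the changed browser() helper (not part of the patch; the URL and the max_time override below are illustrative only):

    from calibre import browser

    # With the new defaults, refresh pauses are honored but capped at max_time=2 seconds.
    br = browser()
    # A site with slow meta-refresh redirects could raise the cap explicitly:
    br = browser(honor_time=True, max_time=5)
    raw = br.open('http://example.com/index.html').read()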