Improved recipe for Wall Street Journal. Fixes #1048 (no longer can download Wall Street Journal)
parent ce468602a0
commit 30fc423489
@@ -138,9 +138,16 @@ def get_proxies():
     return proxies
 
 
-def browser(honor_time=False):
+def browser(honor_time=True, max_time=2):
+    '''
+    Create a mechanize browser for web scraping. The browser handles cookies,
+    refresh requests and ignores robots.txt. Also uses a proxy if available.
+
+    :param honor_time: If True, honors the pause time in refresh requests
+    :param max_time: Maximum time in seconds to wait during a refresh request
+    '''
     opener = mechanize.Browser()
-    opener.set_handle_refresh(True, honor_time=honor_time)
+    opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
     opener.set_handle_robots(False)
     opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
     http_proxy = get_proxies().get('http', None)
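The new max_time parameter matters because honor_time now defaults to True: mechanize will sleep for the pause requested by a page's timed refresh, and without a cap a long <meta http-equiv="refresh"> delay could stall a download indefinitely. A minimal standalone sketch of the patched helper, assuming only the mechanize package (the URL and the cap of 5 are placeholders):

    import mechanize

    # Sketch mirroring the diff above; not calibre's actual module.
    def browser(honor_time=True, max_time=2):
        opener = mechanize.Browser()
        # Follow timed refreshes, but never sleep longer than max_time seconds.
        opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
        opener.set_handle_robots(False)  # recipes must fetch pages robots.txt hides
        return opener

    br = browser(max_time=5)
    print(br.open('http://example.com').read()[:80])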
@@ -493,25 +493,27 @@ class BasicNewsRecipe(object, LoggingInterface):
         @return: Path to index.html
         @rtype: string
         '''
-        res = self.build_index()
-        self.cleanup()
-        self.report_progress(1, _('Download finished'))
-        if self.failed_downloads:
-            self.log_warning(_('Failed to download the following articles:'))
-            for feed, article, debug in self.failed_downloads:
-                self.log_warning(article.title+_(' from ')+feed.title)
-                self.log_debug(article.url)
-                self.log_debug(debug)
-        if self.partial_failures:
-            self.log_warning(_('Failed to download parts of the following articles:'))
-            for feed, atitle, aurl, debug in self.partial_failures:
-                self.log_warning(atitle + _(' from ') + feed)
-                self.log_debug(aurl)
-                self.log_warning(_('\tFailed links:'))
-                for l, tb in debug:
-                    self.log_warning(l)
-                    self.log_debug(tb)
-        return res
+        try:
+            res = self.build_index()
+            self.report_progress(1, _('Download finished'))
+            if self.failed_downloads:
+                self.log_warning(_('Failed to download the following articles:'))
+                for feed, article, debug in self.failed_downloads:
+                    self.log_warning(article.title+_(' from ')+feed.title)
+                    self.log_debug(article.url)
+                    self.log_debug(debug)
+            if self.partial_failures:
+                self.log_warning(_('Failed to download parts of the following articles:'))
+                for feed, atitle, aurl, debug in self.partial_failures:
+                    self.log_warning(atitle + _(' from ') + feed)
+                    self.log_debug(aurl)
+                    self.log_warning(_('\tFailed links:'))
+                    for l, tb in debug:
+                        self.log_warning(l)
+                        self.log_debug(tb)
+            return res
+        finally:
+            self.cleanup()
 
     def feeds2index(self, feeds):
         templ = templates.IndexTemplate()
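Wrapping the body of download() in try/finally guarantees that cleanup(), which for the WSJ recipe below performs the logout request, runs whether build_index() succeeds or raises; previously a failed build skipped it. A self-contained sketch of that control flow, with hypothetical stand-ins for both methods:

    # Hypothetical stand-ins, only to show the control flow the diff introduces.
    def build_index():
        raise IOError('simulated network failure')

    def cleanup():
        print('cleanup ran (e.g. logged out of the site)')

    try:
        try:
            res = build_index()  # may raise partway through a download
        finally:
            cleanup()            # runs on success and on failure alike
    except IOError as err:
        print('download failed:', err)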
@@ -4,27 +4,25 @@ __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
 from calibre.web.feeds.news import BasicNewsRecipe
-import re, urlparse
+
+# http://online.wsj.com/page/us_in_todays_paper.html
 
 class WallStreetJournal(BasicNewsRecipe):
 
     title = 'The Wall Street Journal'
-    __author__ = 'JTravers'
+    __author__ = 'Kovid Goyal'
     description = 'News and current affairs.'
     needs_subscription = True
     max_articles_per_feed = 10
     timefmt = ' [%a, %b %d, %Y]'
     html2lrf_options = ['--ignore-tables']
 
-    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-    [
-        ## Remove anything before the body of the article.
-        (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
-
-        ## Remove anything after the end of the article.
-        (r'<!-- article end.*?</body>', lambda match : '</body>'),
-    ]
-    ]
+    remove_tags_before = dict(name='h1')
+    remove_tags = [
+        dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", "articleTabs_tab_interactive"]),
+        {'class':['more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
+        ]
+    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
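The rewritten recipe drops the brittle preprocess_regexps, which depended on '<!-- article start -->' markers in WSJ's markup, in favour of declarative tag filters: each entry in remove_tags is an attribute spec that the fetcher hands to BeautifulSoup's find/findAll. A small illustration using the modern bs4 package (calibre bundled BeautifulSoup 3 at the time; the HTML here is invented):

    from bs4 import BeautifulSoup

    html = '<h1>Story</h1><p>Body text</p><div class="more_in">related links</div>'
    soup = BeautifulSoup(html, 'html.parser')

    # Each remove_tags entry is an attribute spec; a list value matches
    # any one of the listed class names.
    for spec in [{'class': ['more_in', 'aTools']}]:
        for tag in soup.find_all(attrs=spec):
            tag.extract()  # drop the matched subtree

    print(soup)  # -> <h1>Story</h1><p>Body text</p>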
@@ -36,9 +34,14 @@ class WallStreetJournal(BasicNewsRecipe):
         br.submit()
         return br
 
-    def print_version(self, url):
-        article = urlparse.urlparse(url).path.rpartition('/')[-1]
-        return 'http://online.wsj.com/article_print/'+article
+    def get_article_url(self, article):
+        try:
+            return article.feedburner_origlink.split('?')[0]
+        except AttributeError:
+            return article.link.split('?')[0]
+
+    def cleanup(self):
+        self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')
 
     def get_feeds(self):
         return [
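get_article_url() replaces print_version(): rather than rewriting each URL into the article_print form, the recipe now resolves a feed entry to its canonical WSJ link, preferring feedparser's feedburner_origlink attribute when the feed is served through FeedBurner, and strips the tracking query string. A standalone sketch with an invented entry object:

    # Invented feed-entry object standing in for a feedparser entry.
    class Entry(object):
        pass

    entry = Entry()
    entry.link = 'http://feeds.wsj.com/click.phdo?i=abc123'  # made-up redirect URL
    entry.feedburner_origlink = 'http://online.wsj.com/article/SB1.html?mod=rss'

    def get_article_url(article):
        try:
            return article.feedburner_origlink.split('?')[0]
        except AttributeError:  # entry not served via FeedBurner
            return article.link.split('?')[0]

    print(get_article_url(entry))  # http://online.wsj.com/article/SB1.html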
@@ -89,7 +92,3 @@ class WallStreetJournal(BasicNewsRecipe):
         ('Weekend & Leisure - Sports', 'http://online.wsj.com/xml/rss/3_7204.xml'),
         ]
 
-    ## Logout of website
-    ## NOT CURRENTLY WORKING
-    # def cleanup(self):
-    #     self.browser.open('http://commerce.wsj.com/auth/postlogout')
@@ -118,8 +118,10 @@ class RecursiveFetcher(object, LoggingInterface):
                     tag = tag.parent
 
         if self.remove_tags_after is not None:
-            tag = soup.find(**self.remove_tags_after)
-            remove_beyond(tag, 'nextSibling')
+            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
+            for spec in rt:
+                tag = soup.find(**spec)
+                remove_beyond(tag, 'nextSibling')
 
         if self.remove_tags_before is not None:
             tag = soup.find(**self.remove_tags_before)
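This fetcher change is what lets the recipe above set remove_tags_after to a list of two specs: a bare dict is wrapped into a one-element list so both forms iterate uniformly. The normalization idiom as a runnable sketch:

    # Accept either a single spec dict or a list of spec dicts.
    def normalize(remove_tags_after):
        if isinstance(remove_tags_after, dict):
            return [remove_tags_after]
        return remove_tags_after

    print(normalize({'id': 'article_story_body'}))
    print(normalize([{'id': 'article_story_body'}, {'class': 'article story'}]))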