Fix the Wall Street Journal profile

This commit is contained in:
Kovid Goyal 2008-01-29 03:49:28 +00:00
parent f823c2b988
commit dbc4bc0e5b

View File

@@ -7,6 +7,7 @@
''' '''
import re import re
from urlparse import urlparse
from libprs500.ebooks.lrf.web.profiles import DefaultProfile from libprs500.ebooks.lrf.web.profiles import DefaultProfile
@@ -15,9 +16,10 @@ class WallStreetJournal(DefaultProfile):
title = 'Wall Street Journal' title = 'Wall Street Journal'
max_recursions = 2 max_recursions = 2
needs_subscription = True needs_subscription = True
no_stylesheets = False
max_articles_per_feed = 10 max_articles_per_feed = 10
timefmt = ' [%a, %b %d, %Y]' timefmt = ' [%a, %b %d, %Y]'
html2lrf_options = ['--ignore-tables', '--base-font-size=5'] html2lrf_options = ['--ignore-tables']
## Don't grab articles more than 7 days old ## Don't grab articles more than 7 days old
oldest_article = 7 oldest_article = 7
@@ -27,9 +29,6 @@ class WallStreetJournal(DefaultProfile):
## Remove anything before the body of the article. ## Remove anything before the body of the article.
(r'<body.*?<!-- article start', lambda match: '<body><!-- article start'), (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
## Remove any insets from the body of the article.
(r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
## Remove anything after the end of the article. ## Remove anything after the end of the article.
(r'<!-- article end.*?</body>', lambda match : '</body>'), (r'<!-- article end.*?</body>', lambda match : '</body>'),
] ]
@@ -46,7 +45,8 @@ class WallStreetJournal(DefaultProfile):
return br return br
def print_version(self, url): def print_version(self, url):
return url.replace('/article/', '/article_print/') article = urlparse(url).path.rpartition('/')[-1]
return 'http://online.wsj.com/article_print/'+article
## Comment out the feeds you don't want retrieved. ## Comment out the feeds you don't want retrieved.
## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them or use spaces to put them in the order you desire ## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them or use spaces to put them in the order you desire