Update WSJ

This commit is contained in:
Kovid Goyal 2014-10-09 10:02:43 +05:30
parent 4404b6ff95
commit 7028b7ab18
2 changed files with 24 additions and 8 deletions

View File

@@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
import copy
import copy, re
# http://online.wsj.com/page/us_in_todays_paper.html
@@ -22,6 +22,7 @@ class WallStreetJournal(BasicNewsRecipe):
timefmt = ' [%a, %b %d, %Y]'
no_stylesheets = True
ignore_duplicate_articles = {'url'}
remove_attributes = ['style', 'data-scrim']
keep_only_tags = [
dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
@@ -29,12 +30,16 @@ class WallStreetJournal(BasicNewsRecipe):
dict(name='article', id=['article-contents', 'articleBody']),
dict(name='div', id='article_story_body'),
dict(name='div', attrs={'class':'snippet-ad-login'}),
dict(name='div', attrs={'data-module-name':'resp.module.article.articleBody'}),
]
remove_tags = [
dict(attrs={'class':['insetButton', 'insettipBox']}),
dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}),
dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}),
dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
]
preprocess_regexps = [
(re.compile(r'<!--\[if lte IE 8\]>.+?<!\[endif\]-->', re.DOTALL), lambda m: ''),
(re.compile(r'<!\[if ! lte IE 8\]>.+?<!\[endif\]>', re.DOTALL), lambda m:''),
]
use_javascript_to_login = True
@@ -47,7 +52,7 @@ class WallStreetJournal(BasicNewsRecipe):
def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
picdiv = soup.find('img', src=True)
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])
@@ -57,6 +62,9 @@ class WallStreetJournal(BasicNewsRecipe):
img = div.find('img')
if img is not None:
img.extract()
# Use large images
for img in soup.findAll('img', attrs={'data-enlarge':True}):
img['src'] = img['data-enlarge']
return soup

View File

@@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
import copy
import copy, re
class WallStreetJournal(BasicNewsRecipe):
@@ -20,6 +20,7 @@ class WallStreetJournal(BasicNewsRecipe):
timefmt = ' [%a, %b %d, %Y]'
no_stylesheets = True
ignore_duplicate_articles = {'url'}
remove_attributes = ['style', 'data-scrim']
keep_only_tags = [
dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
@@ -27,16 +28,20 @@ class WallStreetJournal(BasicNewsRecipe):
dict(name='article', id=['article-contents', 'articleBody']),
dict(name='div', id='article_story_body'),
dict(name='div', attrs={'class':'snippet-ad-login'}),
dict(name='div', attrs={'data-module-name':'resp.module.article.articleBody'}),
]
remove_tags = [
dict(attrs={'class':['insetButton', 'insettipBox']}),
dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}),
dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}),
dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
]
preprocess_regexps = [
(re.compile(r'<!--\[if lte IE 8\]>.+?<!\[endif\]-->', re.DOTALL), lambda m: ''),
(re.compile(r'<!\[if ! lte IE 8\]>.+?<!\[endif\]>', re.DOTALL), lambda m:''),
]
def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
picdiv = soup.find('img', src=True)
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])
@@ -46,6 +51,9 @@ class WallStreetJournal(BasicNewsRecipe):
img = div.find('img')
if img is not None:
img.extract()
# Use large images
for img in soup.findAll('img', attrs={'data-enlarge':True}):
img['src'] = img['data-enlarge']
return soup