Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-06-23 15:30:45 -04:00)
Update WSJ
This commit is contained in:
parent 4404b6ff95
commit 7028b7ab18
@@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import copy
|
||||
import copy, re
|
||||
|
||||
# http://online.wsj.com/page/us_in_todays_paper.html
|
||||
|
||||
@@ -22,6 +22,7 @@ class WallStreetJournal(BasicNewsRecipe):
|
||||
timefmt = ' [%a, %b %d, %Y]'
|
||||
no_stylesheets = True
|
||||
ignore_duplicate_articles = {'url'}
|
||||
remove_attributes = ['style', 'data-scrim']
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
|
||||
@@ -29,12 +30,16 @@ class WallStreetJournal(BasicNewsRecipe):
|
||||
dict(name='article', id=['article-contents', 'articleBody']),
|
||||
dict(name='div', id='article_story_body'),
|
||||
dict(name='div', attrs={'class':'snippet-ad-login'}),
|
||||
dict(name='div', attrs={'data-module-name':'resp.module.article.articleBody'}),
|
||||
]
|
||||
remove_tags = [
|
||||
dict(attrs={'class':['insetButton', 'insettipBox']}),
|
||||
dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}),
|
||||
dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}),
|
||||
dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
|
||||
]
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<!--\[if lte IE 8\]>.+?<!\[endif\]-->', re.DOTALL), lambda m: ''),
|
||||
(re.compile(r'<!\[if ! lte IE 8\]>.+?<!\[endif\]>', re.DOTALL), lambda m:''),
|
||||
]
|
||||
|
||||
use_javascript_to_login = True
|
||||
|
||||
@@ -47,7 +52,7 @@ class WallStreetJournal(BasicNewsRecipe):
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
if first and hasattr(self, 'add_toc_thumbnail'):
|
||||
picdiv = soup.find('img')
|
||||
picdiv = soup.find('img', src=True)
|
||||
if picdiv is not None:
|
||||
self.add_toc_thumbnail(article,picdiv['src'])
|
||||
|
||||
@@ -57,6 +62,9 @@ class WallStreetJournal(BasicNewsRecipe):
|
||||
img = div.find('img')
|
||||
if img is not None:
|
||||
img.extract()
|
||||
# Use large images
|
||||
for img in soup.findAll('img', attrs={'data-enlarge':True}):
|
||||
img['src'] = img['data-enlarge']
|
||||
|
||||
return soup
|
||||
|
||||
|
@@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import copy
|
||||
import copy, re
|
||||
|
||||
class WallStreetJournal(BasicNewsRecipe):
|
||||
|
||||
@@ -20,6 +20,7 @@ class WallStreetJournal(BasicNewsRecipe):
|
||||
timefmt = ' [%a, %b %d, %Y]'
|
||||
no_stylesheets = True
|
||||
ignore_duplicate_articles = {'url'}
|
||||
remove_attributes = ['style', 'data-scrim']
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
|
||||
@@ -27,16 +28,20 @@ class WallStreetJournal(BasicNewsRecipe):
|
||||
dict(name='article', id=['article-contents', 'articleBody']),
|
||||
dict(name='div', id='article_story_body'),
|
||||
dict(name='div', attrs={'class':'snippet-ad-login'}),
|
||||
dict(name='div', attrs={'data-module-name':'resp.module.article.articleBody'}),
|
||||
]
|
||||
remove_tags = [
|
||||
dict(attrs={'class':['insetButton', 'insettipBox']}),
|
||||
dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}),
|
||||
dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}),
|
||||
dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
|
||||
]
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<!--\[if lte IE 8\]>.+?<!\[endif\]-->', re.DOTALL), lambda m: ''),
|
||||
(re.compile(r'<!\[if ! lte IE 8\]>.+?<!\[endif\]>', re.DOTALL), lambda m:''),
|
||||
]
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
if first and hasattr(self, 'add_toc_thumbnail'):
|
||||
picdiv = soup.find('img')
|
||||
picdiv = soup.find('img', src=True)
|
||||
if picdiv is not None:
|
||||
self.add_toc_thumbnail(article,picdiv['src'])
|
||||
|
||||
@@ -46,6 +51,9 @@ class WallStreetJournal(BasicNewsRecipe):
|
||||
img = div.find('img')
|
||||
if img is not None:
|
||||
img.extract()
|
||||
# Use large images
|
||||
for img in soup.findAll('img', attrs={'data-enlarge':True}):
|
||||
img['src'] = img['data-enlarge']
|
||||
|
||||
return soup
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user