mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Update WSJ
This commit is contained in:
parent
4404b6ff95
commit
7028b7ab18
@@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
|||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
import copy
|
import copy, re
|
||||||
|
|
||||||
# http://online.wsj.com/page/us_in_todays_paper.html
|
# http://online.wsj.com/page/us_in_todays_paper.html
|
||||||
|
|
||||||
@@ -22,6 +22,7 @@ class WallStreetJournal(BasicNewsRecipe):
|
|||||||
timefmt = ' [%a, %b %d, %Y]'
|
timefmt = ' [%a, %b %d, %Y]'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
ignore_duplicate_articles = {'url'}
|
ignore_duplicate_articles = {'url'}
|
||||||
|
remove_attributes = ['style', 'data-scrim']
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
|
dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
|
||||||
@@ -29,12 +30,16 @@ class WallStreetJournal(BasicNewsRecipe):
|
|||||||
dict(name='article', id=['article-contents', 'articleBody']),
|
dict(name='article', id=['article-contents', 'articleBody']),
|
||||||
dict(name='div', id='article_story_body'),
|
dict(name='div', id='article_story_body'),
|
||||||
dict(name='div', attrs={'class':'snippet-ad-login'}),
|
dict(name='div', attrs={'class':'snippet-ad-login'}),
|
||||||
dict(name='div', attrs={'data-module-name':'resp.module.article.articleBody'}),
|
|
||||||
]
|
]
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(attrs={'class':['insetButton', 'insettipBox']}),
|
dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}),
|
||||||
|
dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}),
|
||||||
dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
|
dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
|
||||||
]
|
]
|
||||||
|
preprocess_regexps = [
|
||||||
|
(re.compile(r'<!--\[if lte IE 8\]>.+?<!\[endif\]-->', re.DOTALL), lambda m: ''),
|
||||||
|
(re.compile(r'<!\[if ! lte IE 8\]>.+?<!\[endif\]>', re.DOTALL), lambda m:''),
|
||||||
|
]
|
||||||
|
|
||||||
use_javascript_to_login = True
|
use_javascript_to_login = True
|
||||||
|
|
||||||
@@ -47,7 +52,7 @@ class WallStreetJournal(BasicNewsRecipe):
|
|||||||
|
|
||||||
def populate_article_metadata(self, article, soup, first):
|
def populate_article_metadata(self, article, soup, first):
|
||||||
if first and hasattr(self, 'add_toc_thumbnail'):
|
if first and hasattr(self, 'add_toc_thumbnail'):
|
||||||
picdiv = soup.find('img')
|
picdiv = soup.find('img', src=True)
|
||||||
if picdiv is not None:
|
if picdiv is not None:
|
||||||
self.add_toc_thumbnail(article,picdiv['src'])
|
self.add_toc_thumbnail(article,picdiv['src'])
|
||||||
|
|
||||||
@@ -57,6 +62,9 @@ class WallStreetJournal(BasicNewsRecipe):
|
|||||||
img = div.find('img')
|
img = div.find('img')
|
||||||
if img is not None:
|
if img is not None:
|
||||||
img.extract()
|
img.extract()
|
||||||
|
# Use large images
|
||||||
|
for img in soup.findAll('img', attrs={'data-enlarge':True}):
|
||||||
|
img['src'] = img['data-enlarge']
|
||||||
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
@@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
|||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
import copy
|
import copy, re
|
||||||
|
|
||||||
class WallStreetJournal(BasicNewsRecipe):
|
class WallStreetJournal(BasicNewsRecipe):
|
||||||
|
|
||||||
@@ -20,6 +20,7 @@ class WallStreetJournal(BasicNewsRecipe):
|
|||||||
timefmt = ' [%a, %b %d, %Y]'
|
timefmt = ' [%a, %b %d, %Y]'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
ignore_duplicate_articles = {'url'}
|
ignore_duplicate_articles = {'url'}
|
||||||
|
remove_attributes = ['style', 'data-scrim']
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
|
dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
|
||||||
@@ -27,16 +28,20 @@ class WallStreetJournal(BasicNewsRecipe):
|
|||||||
dict(name='article', id=['article-contents', 'articleBody']),
|
dict(name='article', id=['article-contents', 'articleBody']),
|
||||||
dict(name='div', id='article_story_body'),
|
dict(name='div', id='article_story_body'),
|
||||||
dict(name='div', attrs={'class':'snippet-ad-login'}),
|
dict(name='div', attrs={'class':'snippet-ad-login'}),
|
||||||
dict(name='div', attrs={'data-module-name':'resp.module.article.articleBody'}),
|
|
||||||
]
|
]
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(attrs={'class':['insetButton', 'insettipBox']}),
|
dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}),
|
||||||
|
dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}),
|
||||||
dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
|
dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
|
||||||
]
|
]
|
||||||
|
preprocess_regexps = [
|
||||||
|
(re.compile(r'<!--\[if lte IE 8\]>.+?<!\[endif\]-->', re.DOTALL), lambda m: ''),
|
||||||
|
(re.compile(r'<!\[if ! lte IE 8\]>.+?<!\[endif\]>', re.DOTALL), lambda m:''),
|
||||||
|
]
|
||||||
|
|
||||||
def populate_article_metadata(self, article, soup, first):
|
def populate_article_metadata(self, article, soup, first):
|
||||||
if first and hasattr(self, 'add_toc_thumbnail'):
|
if first and hasattr(self, 'add_toc_thumbnail'):
|
||||||
picdiv = soup.find('img')
|
picdiv = soup.find('img', src=True)
|
||||||
if picdiv is not None:
|
if picdiv is not None:
|
||||||
self.add_toc_thumbnail(article,picdiv['src'])
|
self.add_toc_thumbnail(article,picdiv['src'])
|
||||||
|
|
||||||
@@ -46,6 +51,9 @@ class WallStreetJournal(BasicNewsRecipe):
|
|||||||
img = div.find('img')
|
img = div.find('img')
|
||||||
if img is not None:
|
if img is not None:
|
||||||
img.extract()
|
img.extract()
|
||||||
|
# Use large images
|
||||||
|
for img in soup.findAll('img', attrs={'data-enlarge':True}):
|
||||||
|
img['src'] = img['data-enlarge']
|
||||||
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user