mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
...
This commit is contained in:
parent
d83a9104fd
commit
b3282b3ac5
@ -8,7 +8,7 @@ online.wsj.com
|
|||||||
import re
|
import re
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
|
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
|
||||||
from datetime import timedelta, datetime, date
|
from datetime import timedelta, date
|
||||||
|
|
||||||
class WSJ(BasicNewsRecipe):
|
class WSJ(BasicNewsRecipe):
|
||||||
# formatting adapted from original recipe by Kovid Goyal and Sujata Raman
|
# formatting adapted from original recipe by Kovid Goyal and Sujata Raman
|
||||||
@ -74,16 +74,33 @@ class WSJ(BasicNewsRecipe):
|
|||||||
br = BasicNewsRecipe.get_browser()
|
br = BasicNewsRecipe.get_browser()
|
||||||
return br
|
return br
|
||||||
|
|
||||||
|
|
||||||
def preprocess_html(self,soup):
|
def preprocess_html(self,soup):
|
||||||
|
|
||||||
|
def decode_us_date(datestr):
    """Convert a US-style date string such as 'January 5 2010' to a ``date``.

    The input is stripped and lower-cased, commas are treated as
    whitespace, and the first three whitespace-separated tokens are read
    as full English month name, day of month and year. Any tokens after
    the third are ignored.

    Parsing is done by hand against a fixed list of English month names.

    Raises ValueError when fewer than three tokens are present, when the
    month name is not recognised, when day or year are not integers, or
    when the resulting calendar date is invalid.
    """
    month_names = ['january', 'february', 'march', 'april', 'may', 'june',
                   'july', 'august', 'september', 'october', 'november',
                   'december']
    # Treat commas as separators so "January 5, 2010" parses as well.
    tokens = datestr.replace(',', ' ').strip().lower().split()
    if len(tokens) < 3:
        raise ValueError("Expected '<month> <day> <year>', got %r" % datestr)
    if tokens[0] not in month_names:
        raise ValueError("Unknown month name in date %r" % datestr)
    month = month_names.index(tokens[0]) + 1  # list position is zero-based
    day = int(tokens[1])
    year = int(tokens[2])
    return date(year, month, day)
|
||||||
|
|
||||||
|
# check if article is paid content
|
||||||
|
if self.omit_paid_content:
|
||||||
|
divtags = soup.findAll('div','tooltip')
|
||||||
|
if divtags:
|
||||||
|
for divtag in divtags:
|
||||||
|
if divtag.find(text="Subscriber Content"):
|
||||||
|
return None
|
||||||
|
|
||||||
# check if article is too old
|
# check if article is too old
|
||||||
datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")})
|
datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")})
|
||||||
if datetag:
|
if datetag:
|
||||||
dateline_string = self.tag_to_string(datetag,False)
|
dateline_string = self.tag_to_string(datetag,False)
|
||||||
date_items = dateline_string.split(',')
|
date_items = dateline_string.split(',')
|
||||||
datestring = date_items[0]+date_items[1]
|
datestring = date_items[0]+date_items[1]
|
||||||
article_date = datetime.strptime(datestring.title(),"%B %d %Y")
|
article_date = decode_us_date(datestring)
|
||||||
earliest_date = date.today() - timedelta(days=self.oldest_article)
|
earliest_date = date.today() - timedelta(days=self.oldest_article)
|
||||||
if article_date.date() < earliest_date:
|
if article_date < earliest_date:
|
||||||
self.log("Skipping article dated %s" % datestring)
|
self.log("Skipping article dated %s" % datestring)
|
||||||
return None
|
return None
|
||||||
datetag.parent.extract()
|
datetag.parent.extract()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user