This commit is contained in:
Kovid Goyal 2010-01-21 13:11:37 -07:00
parent d83a9104fd
commit b3282b3ac5

View File

@ -8,7 +8,7 @@ online.wsj.com
import re import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from datetime import timedelta, datetime, date from datetime import timedelta, date
class WSJ(BasicNewsRecipe): class WSJ(BasicNewsRecipe):
# formatting adapted from original recipe by Kovid Goyal and Sujata Raman # formatting adapted from original recipe by Kovid Goyal and Sujata Raman
@ -74,16 +74,33 @@ class WSJ(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
return br return br
def preprocess_html(self,soup): def preprocess_html(self,soup):
def decode_us_date(datestr):
    """Convert a US-style date string into a ``datetime.date``.

    The string is expected to hold a full English month name, a day
    number and a year, in that order, separated by whitespace and/or
    commas, e.g. ``'January 21 2010'`` or ``'January 21, 2010'``.
    Matching of the month name is case-insensitive; any tokens after
    the year are ignored.

    Raises ``ValueError`` if the month name is not recognised, if the
    day or year is not an integer, or if the values do not form a valid
    calendar date. Raises ``IndexError`` if fewer than three tokens are
    present.
    """
    month_names = ['january', 'february', 'march', 'april', 'may', 'june',
                   'july', 'august', 'september', 'october', 'november',
                   'december']
    # Treat commas as plain separators so that a trailing comma after the
    # day ("21,") does not break the int() conversion below.
    parts = datestr.replace(',', ' ').strip().lower().split()
    # list.index is zero-based; calendar months are one-based.
    month = month_names.index(parts[0]) + 1
    day = int(parts[1])
    year = int(parts[2])
    return date(year, month, day)
# check if article is paid content
if self.omit_paid_content:
divtags = soup.findAll('div','tooltip')
if divtags:
for divtag in divtags:
if divtag.find(text="Subscriber Content"):
return None
# check if article is too old # check if article is too old
datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")}) datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")})
if datetag: if datetag:
dateline_string = self.tag_to_string(datetag,False) dateline_string = self.tag_to_string(datetag,False)
date_items = dateline_string.split(',') date_items = dateline_string.split(',')
datestring = date_items[0]+date_items[1] datestring = date_items[0]+date_items[1]
article_date = datetime.strptime(datestring.title(),"%B %d %Y") article_date = decode_us_date(datestring)
earliest_date = date.today() - timedelta(days=self.oldest_article) earliest_date = date.today() - timedelta(days=self.oldest_article)
if article_date.date() < earliest_date: if article_date < earliest_date:
self.log("Skipping article dated %s" % datestring) self.log("Skipping article dated %s" % datestring)
return None return None
datetag.parent.extract() datetag.parent.extract()