Add use_pubdate switch to handle feeds that don't have a pubdate

This commit is contained in:
Kovid Goyal 2007-12-02 22:27:05 +00:00
parent 0e9a8296f2
commit 2ec32b683a

View File

@ -36,6 +36,7 @@ class DefaultProfile(object):
timefmt = ' [%a %d %b %Y]' # The format of the date shown on the first page timefmt = ' [%a %d %b %Y]' # The format of the date shown on the first page
url_search_order = ['guid', 'link'] # THe order of elements to search for a URL when parssing the RSS feed url_search_order = ['guid', 'link'] # THe order of elements to search for a URL when parssing the RSS feed
pubdate_fmt = None # The format string used to parse the publication date in the RSS feed. If set to None some default heuristics are used, these may fail, in which case set this to the correct string or re-implement strptime in your subclass. pubdate_fmt = None # The format string used to parse the publication date in the RSS feed. If set to None some default heuristics are used, these may fail, in which case set this to the correct string or re-implement strptime in your subclass.
use_pubdate = True, # If True will look for a publication date for each article. If False assumes the publication date is the current time.
no_stylesheets = False # Download stylesheets only if False no_stylesheets = False # Download stylesheets only if False
match_regexps = [] # List of regular expressions that determines which links to follow match_regexps = [] # List of regular expressions that determines which links to follow
filter_regexps = [] # List of regular expressions that determines which links to ignore filter_regexps = [] # List of regular expressions that determines which links to ignore
@ -165,14 +166,15 @@ class DefaultProfile(object):
soup = BeautifulStoneSoup(src) soup = BeautifulStoneSoup(src)
for item in soup.findAll('item'): for item in soup.findAll('item'):
try: try:
pubdate = item.find('pubdate') if self.use_pubdate:
if not pubdate: pubdate = item.find('pubdate')
pubdate = item.find('dc:date') if not pubdate:
if not pubdate or not pubdate.string: pubdate = item.find('dc:date')
self.logger.debug('Skipping article as it does not have publication date') if not pubdate or not pubdate.string:
continue self.logger.debug('Skipping article as it does not have publication date')
pubdate = pubdate.string continue
pubdate = pubdate.replace('+0000', 'GMT') pubdate = pubdate.string
pubdate = pubdate.replace('+0000', 'GMT')
for element in self.url_search_order: for element in self.url_search_order:
url = item.find(element) url = item.find(element)
if url: if url:
@ -184,8 +186,8 @@ class DefaultProfile(object):
d = { d = {
'title' : item.find('title').string, 'title' : item.find('title').string,
'url' : self.print_version(url.string), 'url' : self.print_version(url.string),
'timestamp': self.strptime(pubdate), 'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
'date' : pubdate 'date' : pubdate if self.use_pubate else time.ctime()
} }
delta = time.time() - d['timestamp'] delta = time.time() - d['timestamp']
if delta > self.oldest_article*3600*24: if delta > self.oldest_article*3600*24: