This commit is contained in:
Kovid Goyal 2007-11-28 01:49:23 +00:00
parent 6674582247
commit f69e43e986

View File

@ -34,6 +34,7 @@ class DefaultProfile(object):
delay = 0 # Delay between consecutive downloads delay = 0 # Delay between consecutive downloads
timeout = 10 # Timeout for fetching files from server in seconds timeout = 10 # Timeout for fetching files from server in seconds
timefmt = ' [%a %d %b %Y]' # The format of the date shown on the first page timefmt = ' [%a %d %b %Y]' # The format of the date shown on the first page
pubdate_fmt = None # The format string used to parse the publication date in the RSS feed. If set to None, default heuristics are used; if those fail, set this to the correct format string or re-implement strptime in your subclass.
no_stylesheets = False # Download stylesheets only if False no_stylesheets = False # Download stylesheets only if False
match_regexps = [] # List of regular expressions that determines which links to follow match_regexps = [] # List of regular expressions that determines which links to follow
filter_regexps = [] # List of regular expressions that determines which links to ignore filter_regexps = [] # List of regular expressions that determines which links to ignore
@ -163,13 +164,23 @@ class DefaultProfile(object):
soup = BeautifulStoneSoup(src) soup = BeautifulStoneSoup(src)
for item in soup.findAll('item'): for item in soup.findAll('item'):
try: try:
pubdate = item.find('pubdate').string pubdate = item.find('pubdate')
if not pubdate: if not pubdate:
pubdate = item.find('dc:date')
if not pubdate or not pubdate.string:
self.logger.debug('Skipping article as it does not have publication date')
continue continue
pubdate = pubdate.string
pubdate = pubdate.replace('+0000', 'GMT') pubdate = pubdate.replace('+0000', 'GMT')
url = item.find('guid')
if not url:
url = item.find('link')
if not url or not url.string:
self.logger.debug('Skipping article as it does not have a link url')
continue
d = { d = {
'title' : item.find('title').string, 'title' : item.find('title').string,
'url' : self.print_version(item.find('guid').string), 'url' : self.print_version(url.string),
'timestamp': self.strptime(pubdate), 'timestamp': self.strptime(pubdate),
'date' : pubdate 'date' : pubdate
} }
@ -215,6 +226,14 @@ class DefaultProfile(object):
@classmethod @classmethod
def strptime(cls, src): def strptime(cls, src):
delta = 0
zone = re.search(r'\s*(\+\d\d\:{0,1}\d\d)', src)
if zone:
delta = zone.group(1)
hrs, mins = int(delta[1:3]), int(delta[-2:].rstrip())
delta = 60*(hrs*60 + mins) * (-1 if delta.startswith('-') else 1)
src = src.replace(zone.group(), '')
if cls.pubdate_fmt is None:
src = src.strip().split() src = src.strip().split()
try: try:
src[0] = str(cls.DAY_MAP[src[0][:-1]])+',' src[0] = str(cls.DAY_MAP[src[0][:-1]])+','
@ -225,17 +244,14 @@ class DefaultProfile(object):
except KeyError: except KeyError:
src[2] = str(cls.FULL_MONTH_MAP[src[2]]) src[2] = str(cls.FULL_MONTH_MAP[src[2]])
fmt = '%w, %d %m %Y %H:%M:%S' fmt = '%w, %d %m %Y %H:%M:%S'
delta = 0 src = src[:5] # Discard extra information
if src[-1].startswith('+') or src[-1].startswith('-'):
delta = src[-1]
hrs, mins = int(delta[1:3]), int(delta[3:5])
delta = 60*(hrs*60 + mins) * (-1 if delta.startswith('-') else 1)
src = src[:-1] # Discard timezone information
try: try:
time_t = time.strptime(' '.join(src), fmt) time_t = time.strptime(' '.join(src), fmt)
except ValueError: except ValueError:
time_t = time.strptime(' '.join(src), fmt.replace('%Y', '%y')) time_t = time.strptime(' '.join(src), fmt.replace('%Y', '%y'))
return calendar.timegm(time_t)-delta return calendar.timegm(time_t)-delta
else:
return calendar.timegm(time.strptime(src, cls.pubdate_fmt))
def command_line_options(self): def command_line_options(self):
args = [] args = []