mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #340
This commit is contained in:
parent
6674582247
commit
f69e43e986
@ -34,6 +34,7 @@ class DefaultProfile(object):
|
|||||||
delay = 0 # Delay between consecutive downloads
|
delay = 0 # Delay between consecutive downloads
|
||||||
timeout = 10 # Timeout for fetching files from server in seconds
|
timeout = 10 # Timeout for fetching files from server in seconds
|
||||||
timefmt = ' [%a %d %b %Y]' # The format of the date shown on the first page
|
timefmt = ' [%a %d %b %Y]' # The format of the date shown on the first page
|
||||||
|
pubdate_fmt = None # The format string used to parse the publication date in the RSS feed. If set to None some default heuristics are used, these may fail, in which case set this to the correct string or re-implement strptime in your subclass.
|
||||||
no_stylesheets = False # Download stylesheets only if False
|
no_stylesheets = False # Download stylesheets only if False
|
||||||
match_regexps = [] # List of regular expressions that determines which links to follow
|
match_regexps = [] # List of regular expressions that determines which links to follow
|
||||||
filter_regexps = [] # List of regular expressions that determines which links to ignore
|
filter_regexps = [] # List of regular expressions that determines which links to ignore
|
||||||
@ -163,13 +164,23 @@ class DefaultProfile(object):
|
|||||||
soup = BeautifulStoneSoup(src)
|
soup = BeautifulStoneSoup(src)
|
||||||
for item in soup.findAll('item'):
|
for item in soup.findAll('item'):
|
||||||
try:
|
try:
|
||||||
pubdate = item.find('pubdate').string
|
pubdate = item.find('pubdate')
|
||||||
if not pubdate:
|
if not pubdate:
|
||||||
|
pubdate = item.find('dc:date')
|
||||||
|
if not pubdate or not pubdate.string:
|
||||||
|
self.logger.debug('Skipping article as it does not have publication date')
|
||||||
continue
|
continue
|
||||||
|
pubdate = pubdate.string
|
||||||
pubdate = pubdate.replace('+0000', 'GMT')
|
pubdate = pubdate.replace('+0000', 'GMT')
|
||||||
|
url = item.find('guid')
|
||||||
|
if not url:
|
||||||
|
url = item.find('link')
|
||||||
|
if not url or not url.string:
|
||||||
|
self.logger.debug('Skipping article as it does not have a link url')
|
||||||
|
continue
|
||||||
d = {
|
d = {
|
||||||
'title' : item.find('title').string,
|
'title' : item.find('title').string,
|
||||||
'url' : self.print_version(item.find('guid').string),
|
'url' : self.print_version(url.string),
|
||||||
'timestamp': self.strptime(pubdate),
|
'timestamp': self.strptime(pubdate),
|
||||||
'date' : pubdate
|
'date' : pubdate
|
||||||
}
|
}
|
||||||
@ -215,6 +226,14 @@ class DefaultProfile(object):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def strptime(cls, src):
|
def strptime(cls, src):
|
||||||
|
delta = 0
|
||||||
|
zone = re.search(r'\s*(\+\d\d\:{0,1}\d\d)', src)
|
||||||
|
if zone:
|
||||||
|
delta = zone.group(1)
|
||||||
|
hrs, mins = int(delta[1:3]), int(delta[-2:].rstrip())
|
||||||
|
delta = 60*(hrs*60 + mins) * (-1 if delta.startswith('-') else 1)
|
||||||
|
src = src.replace(zone.group(), '')
|
||||||
|
if cls.pubdate_fmt is None:
|
||||||
src = src.strip().split()
|
src = src.strip().split()
|
||||||
try:
|
try:
|
||||||
src[0] = str(cls.DAY_MAP[src[0][:-1]])+','
|
src[0] = str(cls.DAY_MAP[src[0][:-1]])+','
|
||||||
@ -225,17 +244,14 @@ class DefaultProfile(object):
|
|||||||
except KeyError:
|
except KeyError:
|
||||||
src[2] = str(cls.FULL_MONTH_MAP[src[2]])
|
src[2] = str(cls.FULL_MONTH_MAP[src[2]])
|
||||||
fmt = '%w, %d %m %Y %H:%M:%S'
|
fmt = '%w, %d %m %Y %H:%M:%S'
|
||||||
delta = 0
|
src = src[:5] # Discard extra information
|
||||||
if src[-1].startswith('+') or src[-1].startswith('-'):
|
|
||||||
delta = src[-1]
|
|
||||||
hrs, mins = int(delta[1:3]), int(delta[3:5])
|
|
||||||
delta = 60*(hrs*60 + mins) * (-1 if delta.startswith('-') else 1)
|
|
||||||
src = src[:-1] # Discard timezone information
|
|
||||||
try:
|
try:
|
||||||
time_t = time.strptime(' '.join(src), fmt)
|
time_t = time.strptime(' '.join(src), fmt)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
time_t = time.strptime(' '.join(src), fmt.replace('%Y', '%y'))
|
time_t = time.strptime(' '.join(src), fmt.replace('%Y', '%y'))
|
||||||
return calendar.timegm(time_t)-delta
|
return calendar.timegm(time_t)-delta
|
||||||
|
else:
|
||||||
|
return calendar.timegm(time.strptime(src, cls.pubdate_fmt))
|
||||||
|
|
||||||
def command_line_options(self):
|
def command_line_options(self):
|
||||||
args = []
|
args = []
|
||||||
|
Loading…
x
Reference in New Issue
Block a user