News download: Add support for images embedded in the HTML

This commit is contained in:
Kovid Goyal 2012-06-24 08:14:33 +05:30
parent f187d5f7e8
commit a7fe71a54c

View File

@ -12,6 +12,7 @@ from urllib import url2pathname, quote
from httplib import responses from httplib import responses
from PIL import Image from PIL import Image
from cStringIO import StringIO from cStringIO import StringIO
from base64 import b64decode
from calibre import browser, relpath, unicode_path from calibre import browser, relpath, unicode_path
from calibre.constants import filesystem_encoding, iswindows from calibre.constants import filesystem_encoding, iswindows
@ -346,22 +347,29 @@ class RecursiveFetcher(object):
c = 0 c = 0
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src'] iurl = tag['src']
if callable(self.image_url_processor): if iurl.startswith('data:image/'):
iurl = self.image_url_processor(baseurl, iurl) try:
if not urlparse.urlsplit(iurl).scheme: data = b64decode(iurl.partition(',')[-1])
iurl = urlparse.urljoin(baseurl, iurl, False) except:
with self.imagemap_lock: self.log.exception('Failed to decode embedded image')
if self.imagemap.has_key(iurl):
tag['src'] = self.imagemap[iurl]
continue continue
try: else:
data = self.fetch_url(iurl) if callable(self.image_url_processor):
if data == 'GIF89a\x01': iurl = self.image_url_processor(baseurl, iurl)
# Skip empty GIF files as PIL errors on them anyway if not urlparse.urlsplit(iurl).scheme:
iurl = urlparse.urljoin(baseurl, iurl, False)
with self.imagemap_lock:
if self.imagemap.has_key(iurl):
tag['src'] = self.imagemap[iurl]
continue
try:
data = self.fetch_url(iurl)
if data == 'GIF89a\x01':
# Skip empty GIF files as PIL errors on them anyway
continue
except Exception:
self.log.exception('Could not fetch image ', iurl)
continue continue
except Exception:
self.log.exception('Could not fetch image ', iurl)
continue
c += 1 c += 1
fname = ascii_filename('img'+str(c)) fname = ascii_filename('img'+str(c))
if isinstance(fname, unicode): if isinstance(fname, unicode):