From af1319138aa0730c8d1f0ceef575b93a1bb99ba5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 21 Dec 2008 22:02:24 -0800 Subject: [PATCH] EPUB output: Fix parsing of HTML documents that contain spurious 0 bytes. This fixes Time recipe. --- src/calibre/ebooks/html.py | 1 + src/calibre/web/feeds/recipes/time.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index 8c5cc6f8a4..e564f88418 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -447,6 +447,7 @@ class Parser(PreProcessor, LoggingInterface): ''' Create lxml ElementTree from HTML ''' self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:])) src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip() + src = src.replace('\x00', '') src = self.preprocess(src) # lxml chokes on unicode input when it contains encoding declarations for pat in ENCODING_PATS: diff --git a/src/calibre/web/feeds/recipes/time.py b/src/calibre/web/feeds/recipes/time.py index 17df0c32e9..83fe4ab2dc 100644 --- a/src/calibre/web/feeds/recipes/time.py +++ b/src/calibre/web/feeds/recipes/time.py @@ -18,7 +18,7 @@ class Time(BasicNewsRecipe): no_stylesheets = False use_embedded_content = False - cover_url = 'http://img.timeinc.net/time/rd/trunk/www/web/feds/i/logo_time_home.gif' + #cover_url = 'http://img.timeinc.net/time/rd/trunk/www/web/feds/i/logo_time_home.gif' keep_only_tags = [dict(name='div', attrs={'class':'tout1'})] @@ -32,7 +32,14 @@ class Time(BasicNewsRecipe): ,(u'Politics', u'http://feedproxy.google.com/time/politics') ,(u'Travel', u'http://feedproxy.google.com/time/travel') ] - + + def get_cover_url(self): + soup = self.index_to_soup('http://www.time.com/time/') + img = soup.find('img', alt='Current Time.com Cover', width='107') + if img is not None: + return img.get('src', None) + + def print_version(self, url): raw = self.browser.open(url).read() soup = BeautifulSoup(raw.decode('utf8', 'replace'))