EPUB output: Fix parsing of HTML documents that contain spurious NUL (0x00) bytes. This fixes the Time recipe.

2025-07-09 03:04:10 -04:00 · 2008-12-21 22:02:24 -08:00 · 2008-12-21 22:02:24 -08:00 · af1319138a
commit af1319138a
parent 597448a793
2 changed files with 10 additions and 2 deletions
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -447,6 +447,7 @@ class Parser(PreProcessor, LoggingInterface):
        ''' Create lxml ElementTree from HTML '''
        self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
        src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip()
+        src = src.replace('\x00', '')
        src = self.preprocess(src)
        # lxml chokes on unicode input when it contains encoding declarations
        for pat in ENCODING_PATS:
--- a/src/calibre/web/feeds/recipes/time.py
+++ b/src/calibre/web/feeds/recipes/time.py
@@ -18,7 +18,7 @@ class Time(BasicNewsRecipe):
    no_stylesheets        = False
    use_embedded_content  = False
    
-    cover_url = 'http://img.timeinc.net/time/rd/trunk/www/web/feds/i/logo_time_home.gif'
+    #cover_url = 'http://img.timeinc.net/time/rd/trunk/www/web/feds/i/logo_time_home.gif'
    
    keep_only_tags = [dict(name='div', attrs={'class':'tout1'})]

@@ -33,6 +33,13 @@ class Time(BasicNewsRecipe):
                       ,(u'Travel', u'http://feedproxy.google.com/time/travel')
                     ]
    
+    def get_cover_url(self):
+        soup = self.index_to_soup('http://www.time.com/time/')
+        img = soup.find('img', alt='Current Time.com Cover', width='107')
+        if img is not None:
+            return img.get('src', None)
+        
+    
    def print_version(self, url):
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))