EPUB output: Fix parsing of HTML documents that contain spurious null (0x00) bytes. This fixes the Time recipe.

2026-06-07 06:25:26 -04:00 · 2008-12-21 22:02:24 -08:00
parent 597448a793
commit af1319138a
2 changed files with 10 additions and 2 deletions
@@ -447,6 +447,7 @@ class Parser(PreProcessor, LoggingInterface):
        ''' Create lxml ElementTree from HTML '''
        self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
        src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip()
+        src = src.replace('\x00', '')
        src = self.preprocess(src)
        # lxml chokes on unicode input when it contains encoding declarations
        for pat in ENCODING_PATS:
@@ -18,7 +18,7 @@ class Time(BasicNewsRecipe):
    no_stylesheets        = False
    use_embedded_content  = False
    
-    cover_url = 'http://img.timeinc.net/time/rd/trunk/www/web/feds/i/logo_time_home.gif'
+    #cover_url = 'http://img.timeinc.net/time/rd/trunk/www/web/feds/i/logo_time_home.gif'
    
    keep_only_tags = [dict(name='div', attrs={'class':'tout1'})]

@@ -32,7 +32,14 @@ class Time(BasicNewsRecipe):
                       ,(u'Politics', u'http://feedproxy.google.com/time/politics')
                       ,(u'Travel', u'http://feedproxy.google.com/time/travel')
                     ]
-
+    
+    def get_cover_url(self):
+        soup = self.index_to_soup('http://www.time.com/time/')
+        img = soup.find('img', alt='Current Time.com Cover', width='107')
+        if img is not None:
+            return img.get('src', None)
+        
+    
    def print_version(self, url):
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))