EPUB output: Fix parsing of HTML documents that contain spurious NUL (0x00) bytes. This fixes the Time recipe.

This commit is contained in:
Kovid Goyal 2008-12-21 22:02:24 -08:00
parent 597448a793
commit af1319138a
2 changed files with 10 additions and 2 deletions

View File

@ -447,6 +447,7 @@ class Parser(PreProcessor, LoggingInterface):
''' Create lxml ElementTree from HTML '''
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip()
src = src.replace('\x00', '')
src = self.preprocess(src)
# lxml chokes on unicode input when it contains encoding declarations
for pat in ENCODING_PATS:

View File

@ -18,7 +18,7 @@ class Time(BasicNewsRecipe):
no_stylesheets = False
use_embedded_content = False
cover_url = 'http://img.timeinc.net/time/rd/trunk/www/web/feds/i/logo_time_home.gif'
#cover_url = 'http://img.timeinc.net/time/rd/trunk/www/web/feds/i/logo_time_home.gif'
keep_only_tags = [dict(name='div', attrs={'class':'tout1'})]
@ -33,6 +33,13 @@ class Time(BasicNewsRecipe):
,(u'Travel', u'http://feedproxy.google.com/time/travel')
]
def get_cover_url(self):
    """Return the URL of the current cover image, or None if not found.

    Loads http://www.time.com/time/ through ``self.index_to_soup`` and
    searches the result for an ``<img>`` whose ``alt`` is
    'Current Time.com Cover' and whose ``width`` is '107'. The value of
    that tag's ``src`` attribute is returned; when no such tag exists, or
    the tag carries no ``src``, the result is None.
    """
    homepage = self.index_to_soup('http://www.time.com/time/')
    cover_tag = homepage.find('img', alt='Current Time.com Cover', width='107')
    # Guard clause: no matching image means there is no cover URL to report.
    if cover_tag is None:
        return None
    return cover_tag.get('src', None)
def print_version(self, url):
raw = self.browser.open(url).read()
soup = BeautifulSoup(raw.decode('utf8', 'replace'))