mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
EPUB output: Fix parsing of HTML documents that contain spurious NUL (\x00) bytes. This fixes the Time recipe.
This commit is contained in:
parent
597448a793
commit
af1319138a
@ -447,6 +447,7 @@ class Parser(PreProcessor, LoggingInterface):
|
||||
''' Create lxml ElementTree from HTML '''
|
||||
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
|
||||
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip()
|
||||
src = src.replace('\x00', '')
|
||||
src = self.preprocess(src)
|
||||
# lxml chokes on unicode input when it contains encoding declarations
|
||||
for pat in ENCODING_PATS:
|
||||
|
@ -18,7 +18,7 @@ class Time(BasicNewsRecipe):
|
||||
no_stylesheets = False
|
||||
use_embedded_content = False
|
||||
|
||||
cover_url = 'http://img.timeinc.net/time/rd/trunk/www/web/feds/i/logo_time_home.gif'
|
||||
#cover_url = 'http://img.timeinc.net/time/rd/trunk/www/web/feds/i/logo_time_home.gif'
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'tout1'})]
|
||||
|
||||
@ -32,7 +32,14 @@ class Time(BasicNewsRecipe):
|
||||
,(u'Politics', u'http://feedproxy.google.com/time/politics')
|
||||
,(u'Travel', u'http://feedproxy.google.com/time/travel')
|
||||
]
|
||||
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup('http://www.time.com/time/')
|
||||
img = soup.find('img', alt='Current Time.com Cover', width='107')
|
||||
if img is not None:
|
||||
return img.get('src', None)
|
||||
|
||||
|
||||
def print_version(self, url):
|
||||
raw = self.browser.open(url).read()
|
||||
soup = BeautifulSoup(raw.decode('utf8', 'replace'))
|
||||
|
Loading…
x
Reference in New Issue
Block a user