mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
EPUB output: Fix parsing of HTML documents that contain spurious NUL (0x00) bytes. This fixes the Time recipe.
This commit is contained in:
parent
597448a793
commit
af1319138a
@ -447,6 +447,7 @@ class Parser(PreProcessor, LoggingInterface):
|
|||||||
''' Create lxml ElementTree from HTML '''
|
''' Create lxml ElementTree from HTML '''
|
||||||
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
|
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
|
||||||
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip()
|
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip()
|
||||||
|
src = src.replace('\x00', '')
|
||||||
src = self.preprocess(src)
|
src = self.preprocess(src)
|
||||||
# lxml chokes on unicode input when it contains encoding declarations
|
# lxml chokes on unicode input when it contains encoding declarations
|
||||||
for pat in ENCODING_PATS:
|
for pat in ENCODING_PATS:
|
||||||
|
@ -18,7 +18,7 @@ class Time(BasicNewsRecipe):
|
|||||||
no_stylesheets = False
|
no_stylesheets = False
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
|
|
||||||
cover_url = 'http://img.timeinc.net/time/rd/trunk/www/web/feds/i/logo_time_home.gif'
|
#cover_url = 'http://img.timeinc.net/time/rd/trunk/www/web/feds/i/logo_time_home.gif'
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'class':'tout1'})]
|
keep_only_tags = [dict(name='div', attrs={'class':'tout1'})]
|
||||||
|
|
||||||
@ -33,6 +33,13 @@ class Time(BasicNewsRecipe):
|
|||||||
,(u'Travel', u'http://feedproxy.google.com/time/travel')
|
,(u'Travel', u'http://feedproxy.google.com/time/travel')
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def get_cover_url(self):
|
||||||
|
soup = self.index_to_soup('http://www.time.com/time/')
|
||||||
|
img = soup.find('img', alt='Current Time.com Cover', width='107')
|
||||||
|
if img is not None:
|
||||||
|
return img.get('src', None)
|
||||||
|
|
||||||
|
|
||||||
def print_version(self, url):
|
def print_version(self, url):
|
||||||
raw = self.browser.open(url).read()
|
raw = self.browser.open(url).read()
|
||||||
soup = BeautifulSoup(raw.decode('utf8', 'replace'))
|
soup = BeautifulSoup(raw.decode('utf8', 'replace'))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user