EPUB Output: Fix regression in handling of comments that would occasionally cause content in the comments to leak into the text

This commit is contained in:
Kovid Goyal 2009-02-20 10:17:20 -08:00
parent 29d842db3e
commit 8e248482aa
2 changed files with 10 additions and 9 deletions

View File

@ -205,9 +205,8 @@ class HTMLProcessor(Processor, Rationalizer):
def save(self):
for meta in list(self.root.xpath('//meta')):
meta.getparent().remove(meta)
#for img in self.root.xpath('//img[@src]'):
# self.convert_image(img)
Processor.save(self)
# Strip all comments since Adobe DE is petrified of them
Processor.save(self, strip_comments=True)
def remove_first_image(self):
images = self.root.xpath('//img')

View File

@ -332,8 +332,6 @@ class PreProcessor(object):
(re.compile(r'&(\S+?);'), convert_entities),
# Remove the <![if/endif tags inserted by everybody's darling, MS Word
(re.compile(r'(?i)<{0,1}!\[(end){0,1}if[^>]*>'), lambda match: ''),
# Strip all comments since Adobe DE is petrified of them
(re.compile(r'<!--[^>]*>'), lambda match : ''),
]
# Fix pdftohtml markup
@ -447,7 +445,7 @@ class Parser(PreProcessor, LoggingInterface):
def save_path(self):
return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path])
def save(self):
def save(self, strip_comments=False):
'''
Save processed HTML into the content directory.
Should be called after all HTML processing is finished.
@ -458,7 +456,11 @@ class Parser(PreProcessor, LoggingInterface):
svg.set('xmlns', 'http://www.w3.org/2000/svg')
ans = tostring(self.root, pretty_print=self.opts.pretty_print)
ans = re.compile(r'<head>', re.IGNORECASE).sub('<head>\n\t<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n', ans[:1000])+ans[1000:]
ans = re.compile(r'<head>', re.IGNORECASE).sub(
'<head>\n\t<meta http-equiv="Content-Type" '
'content="text/html; charset=utf-8" />\n', ans[:1000])+ans[1000:]
if strip_comments:
ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
with open(self.save_path(), 'wb') as f:
f.write(ans)
return f.name
@ -594,7 +596,7 @@ class Processor(Parser):
mark = etree.Element('hr', style=page_break_before)
elem.addprevious(mark)
def save(self):
def save(self, strip_comments=False):
style_path = os.path.splitext(os.path.basename(self.save_path()))[0]
for i, sheet in enumerate([self.stylesheet, self.font_css, self.override_css]):
if sheet is not None:
@ -608,7 +610,7 @@ class Processor(Parser):
if isinstance(raw, unicode):
raw = raw.encode('utf-8')
open(path, 'wb').write(raw)
return Parser.save(self)
return Parser.save(self, strip_comments=strip_comments)
def populate_toc(self, toc):
'''