EPUB Output: Fix regression in handling of comments that would occasionally cause content in the comments to leak into the text

2025-06-23 15:30:45 -04:00 · 2009-02-20 10:17:20 -08:00 · 2009-02-20 10:17:20 -08:00 · 8e248482aa
commit 8e248482aa
parent 29d842db3e
2 changed files with 10 additions and 9 deletions
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@ -205,9 +205,8 @@ class HTMLProcessor(Processor, Rationalizer):
    def save(self):
        for meta in list(self.root.xpath('//meta')):
            meta.getparent().remove(meta)
-        #for img in self.root.xpath('//img[@src]'):
-        #    self.convert_image(img)
-        Processor.save(self)
+        # Strip all comments since Adobe DE is petrified of them
+        Processor.save(self, strip_comments=True)
        
    def remove_first_image(self):
        images = self.root.xpath('//img')
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@ -332,8 +332,6 @@ class PreProcessor(object):
                  (re.compile(r'&(\S+?);'), convert_entities),
                  # Remove the <![if/endif tags inserted by everybody's darling, MS Word
                  (re.compile(r'(?i)<{0,1}!\[(end){0,1}if[^>]*>'), lambda match: ''),
-                  # Strip all comments since Adobe DE is petrified of them
-                  (re.compile(r'<!--[^>]*>'), lambda match : ''),
                  ]
                     
    # Fix pdftohtml markup
@ -447,7 +445,7 @@ class Parser(PreProcessor, LoggingInterface):
    def save_path(self):
        return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path])
    
-    def save(self):
+    def save(self, strip_comments=False):
        '''
        Save processed HTML into the content directory.
        Should be called after all HTML processing is finished.
@ -458,7 +456,11 @@ class Parser(PreProcessor, LoggingInterface):
            svg.set('xmlns', 'http://www.w3.org/2000/svg')
        
        ans = tostring(self.root, pretty_print=self.opts.pretty_print)
-        ans = re.compile(r'<head>', re.IGNORECASE).sub('<head>\n\t<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n', ans[:1000])+ans[1000:]
+        ans = re.compile(r'<head>', re.IGNORECASE).sub(
+            '<head>\n\t<meta http-equiv="Content-Type" '
+            'content="text/html; charset=utf-8" />\n', ans[:1000])+ans[1000:]
+        if strip_comments:
+            ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
        with open(self.save_path(), 'wb') as f:
            f.write(ans)
            return f.name
@ -594,7 +596,7 @@ class Processor(Parser):
                mark = etree.Element('hr', style=page_break_before)
            elem.addprevious(mark)
    
-    def save(self):
+    def save(self, strip_comments=False):
        style_path = os.path.splitext(os.path.basename(self.save_path()))[0]
        for i, sheet in enumerate([self.stylesheet, self.font_css, self.override_css]):
            if sheet is not None:
@ -608,7 +610,7 @@ class Processor(Parser):
                if isinstance(raw, unicode):
                    raw = raw.encode('utf-8')
                open(path, 'wb').write(raw)
-        return Parser.save(self)
+        return Parser.save(self, strip_comments=strip_comments)
    
    def populate_toc(self, toc):
        '''