Remove escaped EM tags, keeping only content

The source epub contains multiple escaped EM tags, including inside title tags and the table of contents. The safest (and easiest) way to get a readable epub from it is to strip the escaped EM tags from all .html and .ncx files altogether, rather than replacing them with unescaped ones, while keeping the human-readable content they enclose in place.
2025-06-23 15:30:45 -04:00 · 2014-04-30 10:20:01 +02:00 · 2014-04-30 10:20:01 +02:00 · a573e71e87
commit a573e71e87
parent 93e49eb434
1 changed files with 13 additions and 1 deletions
--- a/recipes/nrc_next.recipe
+++ b/recipes/nrc_next.recipe
@@ -8,7 +8,7 @@ __copyright__ = '2014, Niels Giesen'
 '''
 www.nrc.nl
 '''
-import os, zipfile
+import os, zipfile, re
 from io import BytesIO
 from calibre.web.feeds.news import BasicNewsRecipe
@@ -57,5 +57,17 @@ class NRCNext(BasicNewsRecipe):
        zfile = zipfile.ZipFile(BytesIO(epubraw), 'r')
        zfile.extractall(self.output_dir)
        namelist = zfile.namelist()
        emre = re.compile("&lt;em(?:.*)&gt;(.*)&lt;/em&gt;")
        subst = '\\1'
        for name in namelist:
            _, ext = os.path.splitext(name);
            if (ext == '.html') or (ext == '.ncx'):
                fname = os.path.join(self.output_dir, name)
                with open(fname) as f:
                    s = f.read()
                    s = emre.sub(subst, s)
                with open(fname, 'w') as f:
                    f.write(s)
        index = os.path.join(self.output_dir, 'metadata.opf')
        return index