From a573e71e87a5f9417f8dfdc5a4e20f72b47b57a5 Mon Sep 17 00:00:00 2001
From: Niels Giesen <niels.giesen@gmail.com>
Date: Wed, 30 Apr 2014 10:20:01 +0200
Subject: [PATCH] Remove escaped EM tags, keeping only content

The source epub contains multiple escaped EM tags, including in title
tags and the table of contents. The safest (and easiest) way to get a
readable epub from this is to remove the escaped EM tags altogether,
everywhere in .html and .ncx files (instead of replacing them with
unescaped ones), while keeping their human-readable contents in
place.
---
 recipes/nrc_next.recipe | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/recipes/nrc_next.recipe b/recipes/nrc_next.recipe
index c630296595..5cbe23a6a4 100644
--- a/recipes/nrc_next.recipe
+++ b/recipes/nrc_next.recipe
@@ -8,7 +8,7 @@ __copyright__ = '2014, Niels Giesen'
 '''
 www.nrc.nl
 '''
-import os, zipfile
+import os, zipfile, re
 from io import BytesIO
 
 from calibre.web.feeds.news import BasicNewsRecipe
@@ -57,5 +57,17 @@ class NRCNext(BasicNewsRecipe):
 
         zfile = zipfile.ZipFile(BytesIO(epubraw), 'r')
         zfile.extractall(self.output_dir)
+        namelist = zfile.namelist()
+        emre = re.compile("&lt;em(?:.*)&gt;(.*)&lt;/em&gt;")
+        subst = '\\1'
+        for name in namelist:
+            _, ext = os.path.splitext(name);
+            if (ext == '.html') or (ext == '.ncx'):
+                fname = os.path.join(self.output_dir, name)
+                with open(fname) as f:
+                    s = f.read()
+                    s = emre.sub(subst, s)
+                with open(fname, 'w') as f:
+                    f.write(s)
         index = os.path.join(self.output_dir, 'metadata.opf')
         return index