From a573e71e87a5f9417f8dfdc5a4e20f72b47b57a5 Mon Sep 17 00:00:00 2001 From: Niels Giesen Date: Wed, 30 Apr 2014 10:20:01 +0200 Subject: [PATCH] Remove escaped EM tags, keeping only content The source epub contains multiple escaped EM tags, also in title tags and the table of contents. The safest way (and easiest way) to get a readable epub from this is to dispense with the escaped EM tags altogether, everywhere in .html and .ncx files (instead of replacing them with unescaped ones), of course while keeping the human-readable contents in place. --- recipes/nrc_next.recipe | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/recipes/nrc_next.recipe b/recipes/nrc_next.recipe index c630296595..5cbe23a6a4 100644 --- a/recipes/nrc_next.recipe +++ b/recipes/nrc_next.recipe @@ -8,7 +8,7 @@ __copyright__ = '2014, Niels Giesen' ''' www.nrc.nl ''' -import os, zipfile +import os, zipfile, re from io import BytesIO from calibre.web.feeds.news import BasicNewsRecipe @@ -57,5 +57,17 @@ class NRCNext(BasicNewsRecipe): zfile = zipfile.ZipFile(BytesIO(epubraw), 'r') zfile.extractall(self.output_dir) + namelist = zfile.namelist() + emre = re.compile("<em(?:.*)>(.*)</em>") + subst = '\\1' + for name in namelist: + _, ext = os.path.splitext(name); + if (ext == '.html') or (ext == '.ncx'): + fname = os.path.join(self.output_dir, name) + with open(fname) as f: + s = f.read() + s = emre.sub(subst, s) + with open(fname, 'w') as f: + f.write(s) index = os.path.join(self.output_dir, 'metadata.opf') return index