Remove escaped EM tags, keeping only content

The source epub contains multiple escaped EM tags, including in title
tags and the table of contents. The safest (and easiest) way to get a
readable epub from this is to dispense with the escaped EM tags
altogether, everywhere in .html and .ncx files (instead of replacing
them with unescaped ones), while keeping the human-readable contents
in place.
This commit is contained in:
Niels Giesen 2014-04-30 10:20:01 +02:00
parent 93e49eb434
commit a573e71e87

View File

@ -8,7 +8,7 @@ __copyright__ = '2014, Niels Giesen'
''' '''
www.nrc.nl www.nrc.nl
''' '''
import os, zipfile import os, zipfile, re
from io import BytesIO from io import BytesIO
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -57,5 +57,17 @@ class NRCNext(BasicNewsRecipe):
zfile = zipfile.ZipFile(BytesIO(epubraw), 'r') zfile = zipfile.ZipFile(BytesIO(epubraw), 'r')
zfile.extractall(self.output_dir) zfile.extractall(self.output_dir)
namelist = zfile.namelist()
emre = re.compile("<em(?:.*)>(.*)</em>")
subst = '\\1'
for name in namelist:
_, ext = os.path.splitext(name);
if (ext == '.html') or (ext == '.ncx'):
fname = os.path.join(self.output_dir, name)
with open(fname) as f:
s = f.read()
s = emre.sub(subst, s)
with open(fname, 'w') as f:
f.write(s)
index = os.path.join(self.output_dir, 'metadata.opf') index = os.path.join(self.output_dir, 'metadata.opf')
return index return index