mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Remove escaped EM tags, keeping only content
The source EPUB contains multiple escaped EM tags, including in title tags and in the table of contents. The safest (and easiest) way to get a readable EPUB from this is to remove the escaped EM tags altogether from all .html and .ncx files, rather than replacing them with unescaped ones, while keeping their human-readable contents in place.
This commit is contained in:
parent
93e49eb434
commit
a573e71e87
@ -8,7 +8,7 @@ __copyright__ = '2014, Niels Giesen'
|
|||||||
'''
|
'''
|
||||||
www.nrc.nl
|
www.nrc.nl
|
||||||
'''
|
'''
|
||||||
import os, zipfile
|
import os, zipfile, re
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
@ -57,5 +57,17 @@ class NRCNext(BasicNewsRecipe):
|
|||||||
|
|
||||||
zfile = zipfile.ZipFile(BytesIO(epubraw), 'r')
|
zfile = zipfile.ZipFile(BytesIO(epubraw), 'r')
|
||||||
zfile.extractall(self.output_dir)
|
zfile.extractall(self.output_dir)
|
||||||
|
namelist = zfile.namelist()
|
||||||
|
emre = re.compile("<em(?:.*)>(.*)</em>")
|
||||||
|
subst = '\\1'
|
||||||
|
for name in namelist:
|
||||||
|
_, ext = os.path.splitext(name);
|
||||||
|
if (ext == '.html') or (ext == '.ncx'):
|
||||||
|
fname = os.path.join(self.output_dir, name)
|
||||||
|
with open(fname) as f:
|
||||||
|
s = f.read()
|
||||||
|
s = emre.sub(subst, s)
|
||||||
|
with open(fname, 'w') as f:
|
||||||
|
f.write(s)
|
||||||
index = os.path.join(self.output_dir, 'metadata.opf')
|
index = os.path.join(self.output_dir, 'metadata.opf')
|
||||||
return index
|
return index
|
||||||
|
Loading…
x
Reference in New Issue
Block a user