From e2c6d4e5c0eb128c50cae70277cf4c73281d2e86 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 22 Dec 2014 09:30:12 +0530 Subject: [PATCH] FB2 Input: Workaround for broken FB2 files produced by convertonlinefree.com. See #1404701 (Does not display some fb2 books) --- .../ebooks/conversion/plugins/fb2_input.py | 3 +++ src/calibre/ebooks/metadata/fb2.py | 18 +++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/plugins/fb2_input.py b/src/calibre/ebooks/conversion/plugins/fb2_input.py index 61f36fd458..424aa0b7e3 100644 --- a/src/calibre/ebooks/conversion/plugins/fb2_input.py +++ b/src/calibre/ebooks/conversion/plugins/fb2_input.py @@ -36,6 +36,7 @@ class FB2Input(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): from lxml import etree + from calibre.ebooks.metadata.fb2 import ensure_namespace from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER @@ -57,10 +58,12 @@ class FB2Input(InputFormatPlugin): parser=RECOVER_PARSER) if doc is None: raise ValueError('The FB2 file is not valid XML') + doc = ensure_namespace(doc) try: fb_ns = doc.nsmap[doc.prefix] except Exception: fb_ns = FB2NS + NAMESPACES = {'f':fb_ns, 'l':XLINK_NS} stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]') css = '' diff --git a/src/calibre/ebooks/metadata/fb2.py b/src/calibre/ebooks/metadata/fb2.py index d1b5fe1560..829eb7e852 100644 --- a/src/calibre/ebooks/metadata/fb2.py +++ b/src/calibre/ebooks/metadata/fb2.py @@ -278,7 +278,7 @@ def _get_fbroot(stream): raw = stream.read() raw = xml_to_unicode(raw, strip_encoding_pats=True)[0] root = etree.fromstring(raw, parser=parser) - return root + return ensure_namespace(root) def _set_title(title_info, mi, ctx): if not mi.is_null('title'): @@ -381,3 +381,19 @@ def set_metadata(stream, mi, apply_null=False, 
def ensure_namespace(doc):
    """Return *doc* with bare ``description``/``body`` elements re-namespaced.

    Workaround for broken FB2 files produced by convertonlinefree.com, which
    reset the default namespace on these elements with ``xmlns=""``. See
    https://bugs.launchpad.net/bugs/1404701

    If *doc* has no direct ``description`` or ``body`` child with an
    un-namespaced tag, it is returned unchanged (same object). Otherwise the
    tree is serialised, the empty default-namespace declaration is removed
    from every ``description``/``body`` start tag so those elements (and
    their descendants) inherit the enclosing default namespace again, and a
    freshly parsed root element is returned.

    :param doc: root element of the parsed FB2 document
    :return: *doc* itself, or a newly parsed root element
    """
    # A single pass over both tag names that stops at the first bare tag.
    has_bare_tags = any(
        '{' not in child.tag
        for name in ('description', 'body')
        for child in doc.findall(name)
    )
    if not has_bare_tags:
        return doc

    import re
    # The string name 'unicode' requests text output without relying on the
    # Python 2-only ``unicode`` builtin.
    raw = etree.tostring(doc, encoding='unicode')
    # Drop xmlns="" from the start tag while preserving any attributes that
    # precede (group 2) or follow (group 3) it, e.g. name="notes" on a body
    # or a trailing "/" on a self-closing tag.
    raw = re.sub(
        r'''<(description|body)((?:\s[^<>]*?)?)\s+xmlns=['"]['"]([^<>]*)>''',
        r'<\1\2\3>', raw)
    return etree.fromstring(raw)