def ensure_namespace(doc):
    """Return *doc* with bare <description>/<body> children re-namespaced.

    Workaround for broken FB2 files produced by convertonlinefree.com. See
    https://bugs.launchpad.net/bugs/1404701

    The direct children of *doc* named ``description`` or ``body`` are
    inspected.  If any of them carries no namespace, the tree is serialized,
    the empty ``xmlns=""`` declarations on those tags are stripped so that
    the elements inherit the namespace of their parent, and the text is
    parsed again.

    :param doc: root element of the parsed FB2 document.
    :return: *doc* itself when no bare tag is found, otherwise the root
        element of a freshly parsed tree.
    """
    # A tag without a '{' has no '{namespace}' prefix, i.e. it is bare.
    # any() stops at the first hit instead of scanning the remaining tags.
    bare_tags = any(
        '{' not in child.tag
        for name in ('description', 'body')
        for child in doc.findall(name)
    )
    if not bare_tags:
        return doc

    import re
    # The encoding *name* 'unicode' requests a text (not bytes) result with
    # no XML declaration, without relying on the Python 2 only ``unicode``
    # builtin.
    raw = etree.tostring(doc, encoding='unicode')
    # Drop the empty default-namespace declaration; the optional '/' keeps
    # self-closing elements such as <body xmlns=""/> self-closing.
    raw = re.sub(
        r'''<(description|body)\s+xmlns=['"]['"]\s*(/?)>''', r'<\1\2>', raw)
    return etree.fromstring(raw)