Fix #5931 (Calibre does not convert whole LRF to EPUB)

This commit is contained in:
Kovid Goyal 2010-06-24 23:56:56 -06:00
parent 80548645de
commit 5f41fde25f
2 changed files with 13 additions and 13 deletions

View File

@ -367,7 +367,7 @@ class LRFInput(InputFormatPlugin):
xml = d.to_xml(write_files=True) xml = d.to_xml(write_files=True)
if options.verbose > 2: if options.verbose > 2:
open('lrs.xml', 'wb').write(xml.encode('utf-8')) open('lrs.xml', 'wb').write(xml.encode('utf-8'))
parser = etree.XMLParser(recover=True, no_network=True) parser = etree.XMLParser(recover=True, no_network=True, huge_tree=True)
doc = etree.fromstring(xml, parser=parser) doc = etree.fromstring(xml, parser=parser)
char_button_map = {} char_button_map = {}
for x in doc.xpath('//CharButton[@refobj]'): for x in doc.xpath('//CharButton[@refobj]'):

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, re, uuid, logging import os, re, uuid, logging, functools
from mimetypes import types_map from mimetypes import types_map
from collections import defaultdict from collections import defaultdict
from itertools import count from itertools import count
@ -26,6 +26,8 @@ from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.ebooks.conversion.preprocess import CSSPreProcessor from calibre.ebooks.conversion.preprocess import CSSPreProcessor
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True, huge_tree=True)
XML_NS = 'http://www.w3.org/XML/1998/namespace' XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml' XHTML_NS = 'http://www.w3.org/1999/xhtml'
OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/' OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/'
@ -233,8 +235,6 @@ PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''') CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')
RECOVER_PARSER = etree.XMLParser(recover=True)
def element(parent, *args, **kwargs): def element(parent, *args, **kwargs):
if parent is not None: if parent is not None:
@ -780,8 +780,7 @@ class Manifest(object):
assume_utf8=True, resolve_entities=True)[0] assume_utf8=True, resolve_entities=True)[0]
if not data: if not data:
return None return None
parser = etree.XMLParser(recover=True) return etree.fromstring(data, parser=RECOVER_PARSER)
return etree.fromstring(data, parser=parser)
def _parse_xhtml(self, data): def _parse_xhtml(self, data):
self.oeb.log.debug('Parsing', self.href, '...') self.oeb.log.debug('Parsing', self.href, '...')
@ -809,16 +808,17 @@ class Manifest(object):
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys()))) pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
data = pat.sub(lambda m:user_entities[m.group(1)], data) data = pat.sub(lambda m:user_entities[m.group(1)], data)
fromstring = functools.partial(etree.fromstring, parser=RECOVER_PARSER)
# Try with more & more drastic measures to parse # Try with more & more drastic measures to parse
def first_pass(data): def first_pass(data):
try: try:
data = etree.fromstring(data) data = fromstring(data)
except etree.XMLSyntaxError, err: except etree.XMLSyntaxError, err:
self.oeb.log.exception('Initial parse failed:') self.oeb.log.exception('Initial parse failed:')
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
data = ENTITY_RE.sub(repl, data) data = ENTITY_RE.sub(repl, data)
try: try:
data = etree.fromstring(data) data = fromstring(data)
except etree.XMLSyntaxError, err: except etree.XMLSyntaxError, err:
self.oeb.logger.warn('Parsing file %r as HTML' % self.href) self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
if err.args and err.args[0].startswith('Excessive depth'): if err.args and err.args[0].startswith('Excessive depth'):
@ -832,9 +832,9 @@ class Manifest(object):
elem.text = elem.text.strip('-') elem.text = elem.text.strip('-')
data = etree.tostring(data, encoding=unicode) data = etree.tostring(data, encoding=unicode)
try: try:
data = etree.fromstring(data) data = fromstring(data)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
data = etree.fromstring(data, parser=RECOVER_PARSER) data = fromstring(data)
return data return data
data = first_pass(data) data = first_pass(data)
@ -866,12 +866,12 @@ class Manifest(object):
data = etree.tostring(data, encoding=unicode) data = etree.tostring(data, encoding=unicode)
try: try:
data = etree.fromstring(data) data = fromstring(data)
except: except:
data = data.replace(':=', '=').replace(':>', '>') data = data.replace(':=', '=').replace(':>', '>')
data = data.replace('<http:/>', '') data = data.replace('<http:/>', '')
try: try:
data = etree.fromstring(data) data = fromstring(data)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
self.oeb.logger.warn('Stripping comments and meta tags from %s'% self.oeb.logger.warn('Stripping comments and meta tags from %s'%
self.href) self.href)
@ -882,7 +882,7 @@ class Manifest(object):
"<?xml version='1.0' encoding='utf-8'?><o:p></o:p>", "<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
'') '')
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '') data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
data = etree.fromstring(data) data = fromstring(data)
elif namespace(data.tag) != XHTML_NS: elif namespace(data.tag) != XHTML_NS:
# OEB_DOC_NS, but possibly others # OEB_DOC_NS, but possibly others
ns = namespace(data.tag) ns = namespace(data.tag)