mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #5931 (Calibre does not convert whole LRF to EPUB)
This commit is contained in:
parent
80548645de
commit
5f41fde25f
@ -367,7 +367,7 @@ class LRFInput(InputFormatPlugin):
|
|||||||
xml = d.to_xml(write_files=True)
|
xml = d.to_xml(write_files=True)
|
||||||
if options.verbose > 2:
|
if options.verbose > 2:
|
||||||
open('lrs.xml', 'wb').write(xml.encode('utf-8'))
|
open('lrs.xml', 'wb').write(xml.encode('utf-8'))
|
||||||
parser = etree.XMLParser(recover=True, no_network=True)
|
parser = etree.XMLParser(recover=True, no_network=True, huge_tree=True)
|
||||||
doc = etree.fromstring(xml, parser=parser)
|
doc = etree.fromstring(xml, parser=parser)
|
||||||
char_button_map = {}
|
char_button_map = {}
|
||||||
for x in doc.xpath('//CharButton[@refobj]'):
|
for x in doc.xpath('//CharButton[@refobj]'):
|
||||||
|
@ -7,7 +7,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import os, re, uuid, logging
|
import os, re, uuid, logging, functools
|
||||||
from mimetypes import types_map
|
from mimetypes import types_map
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from itertools import count
|
from itertools import count
|
||||||
@ -26,6 +26,8 @@ from calibre.ebooks.chardet import xml_to_unicode
|
|||||||
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
|
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
|
||||||
from calibre.ebooks.conversion.preprocess import CSSPreProcessor
|
from calibre.ebooks.conversion.preprocess import CSSPreProcessor
|
||||||
|
|
||||||
|
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True, huge_tree=True)
|
||||||
|
|
||||||
XML_NS = 'http://www.w3.org/XML/1998/namespace'
|
XML_NS = 'http://www.w3.org/XML/1998/namespace'
|
||||||
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
||||||
OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/'
|
OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/'
|
||||||
@ -233,8 +235,6 @@ PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
|
|||||||
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
||||||
CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')
|
CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')
|
||||||
|
|
||||||
RECOVER_PARSER = etree.XMLParser(recover=True)
|
|
||||||
|
|
||||||
|
|
||||||
def element(parent, *args, **kwargs):
|
def element(parent, *args, **kwargs):
|
||||||
if parent is not None:
|
if parent is not None:
|
||||||
@ -780,8 +780,7 @@ class Manifest(object):
|
|||||||
assume_utf8=True, resolve_entities=True)[0]
|
assume_utf8=True, resolve_entities=True)[0]
|
||||||
if not data:
|
if not data:
|
||||||
return None
|
return None
|
||||||
parser = etree.XMLParser(recover=True)
|
return etree.fromstring(data, parser=RECOVER_PARSER)
|
||||||
return etree.fromstring(data, parser=parser)
|
|
||||||
|
|
||||||
def _parse_xhtml(self, data):
|
def _parse_xhtml(self, data):
|
||||||
self.oeb.log.debug('Parsing', self.href, '...')
|
self.oeb.log.debug('Parsing', self.href, '...')
|
||||||
@ -809,16 +808,17 @@ class Manifest(object):
|
|||||||
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
|
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
|
||||||
data = pat.sub(lambda m:user_entities[m.group(1)], data)
|
data = pat.sub(lambda m:user_entities[m.group(1)], data)
|
||||||
|
|
||||||
|
fromstring = functools.partial(etree.fromstring, parser=RECOVER_PARSER)
|
||||||
# Try with more & more drastic measures to parse
|
# Try with more & more drastic measures to parse
|
||||||
def first_pass(data):
|
def first_pass(data):
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data)
|
data = fromstring(data)
|
||||||
except etree.XMLSyntaxError, err:
|
except etree.XMLSyntaxError, err:
|
||||||
self.oeb.log.exception('Initial parse failed:')
|
self.oeb.log.exception('Initial parse failed:')
|
||||||
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
|
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
|
||||||
data = ENTITY_RE.sub(repl, data)
|
data = ENTITY_RE.sub(repl, data)
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data)
|
data = fromstring(data)
|
||||||
except etree.XMLSyntaxError, err:
|
except etree.XMLSyntaxError, err:
|
||||||
self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
|
self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
|
||||||
if err.args and err.args[0].startswith('Excessive depth'):
|
if err.args and err.args[0].startswith('Excessive depth'):
|
||||||
@ -832,9 +832,9 @@ class Manifest(object):
|
|||||||
elem.text = elem.text.strip('-')
|
elem.text = elem.text.strip('-')
|
||||||
data = etree.tostring(data, encoding=unicode)
|
data = etree.tostring(data, encoding=unicode)
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data)
|
data = fromstring(data)
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
data = fromstring(data)
|
||||||
return data
|
return data
|
||||||
data = first_pass(data)
|
data = first_pass(data)
|
||||||
|
|
||||||
@ -866,12 +866,12 @@ class Manifest(object):
|
|||||||
data = etree.tostring(data, encoding=unicode)
|
data = etree.tostring(data, encoding=unicode)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data)
|
data = fromstring(data)
|
||||||
except:
|
except:
|
||||||
data = data.replace(':=', '=').replace(':>', '>')
|
data = data.replace(':=', '=').replace(':>', '>')
|
||||||
data = data.replace('<http:/>', '')
|
data = data.replace('<http:/>', '')
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data)
|
data = fromstring(data)
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
self.oeb.logger.warn('Stripping comments and meta tags from %s'%
|
self.oeb.logger.warn('Stripping comments and meta tags from %s'%
|
||||||
self.href)
|
self.href)
|
||||||
@ -882,7 +882,7 @@ class Manifest(object):
|
|||||||
"<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
|
"<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
|
||||||
'')
|
'')
|
||||||
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
|
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
|
||||||
data = etree.fromstring(data)
|
data = fromstring(data)
|
||||||
elif namespace(data.tag) != XHTML_NS:
|
elif namespace(data.tag) != XHTML_NS:
|
||||||
# OEB_DOC_NS, but possibly others
|
# OEB_DOC_NS, but possibly others
|
||||||
ns = namespace(data.tag)
|
ns = namespace(data.tag)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user