FB2 Input: Make parsing of malformed FB2 files a little more robust

This commit is contained in:
Kovid Goyal 2010-10-29 12:17:54 -06:00
parent 270f46b041
commit 8fb38c3862
3 changed files with 13 additions and 8 deletions

View File

@@ -178,13 +178,13 @@
 </xsl:if>
 <xsl:if test="$section_toc_id != 'None'">
 <xsl:element name="a">
-<xsl:attribute name="name">TOC_<xsl:value-of select="$section_toc_id"/></xsl:attribute>
+<xsl:attribute name="id">TOC_<xsl:value-of select="$section_toc_id"/></xsl:attribute>
 </xsl:element>
 </xsl:if>
 <a name="TOC_{generate-id()}"></a>
 <xsl:if test="@id">
 <xsl:element name="a">
-<xsl:attribute name="name"><xsl:value-of select="@id"/></xsl:attribute>
+<xsl:attribute name="id"><xsl:value-of select="@id"/></xsl:attribute>
 </xsl:element>
 </xsl:if>
 <xsl:apply-templates/>
@@ -194,7 +194,7 @@
 <xsl:element name="h6">
 <xsl:if test="@id">
 <xsl:element name="a">
-<xsl:attribute name="name"><xsl:value-of select="@id"/></xsl:attribute>
+<xsl:attribute name="id"><xsl:value-of select="@id"/></xsl:attribute>
 </xsl:element>
 </xsl:if>
 <xsl:apply-templates/>

View File

@@ -40,14 +40,18 @@ class FB2Input(InputFormatPlugin):
 accelerators):
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.meta import get_metadata
-from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS
+from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
 NAMESPACES = {'f':FB2NS, 'l':XLINK_NS}
 log.debug('Parsing XML...')
-raw = stream.read()
+raw = stream.read().replace('\0', '')
 try:
-doc = etree.fromstring(raw.replace('\0', ''))
+doc = etree.fromstring(raw)
 except etree.XMLSyntaxError:
-doc = etree.fromstring(raw.replace('& ', '&amp;'))
+try:
+doc = etree.fromstring(raw, parser=RECOVER_PARSER)
+except:
+doc = etree.fromstring(raw.replace('& ', '&amp;'),
+parser=RECOVER_PARSER)
 stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
 css = ''
 for s in stylesheets:

View File

@@ -22,7 +22,8 @@ def get_metadata(stream):
 'xlink':XLINK_NS})
 tostring = lambda x : etree.tostring(x, method='text',
 encoding=unicode).strip()
-root = etree.fromstring(stream.read())
+parser = etree.XMLParser(recover=True, no_network=True)
+root = etree.fromstring(stream.read(), parser=parser)
 authors, author_sort = [], None
 for au in XPath('//fb2:author')(root):
 fname = lname = author = None