mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Encode metadata block in UTF-8, thereby working around endianness issues and hopefully fixing #321
This commit is contained in:
parent
e5ce6adad4
commit
5cbe331c07
@ -29,9 +29,12 @@ from shutil import copyfileobj
|
|||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
import xml.dom.minidom as dom
|
import xml.dom.minidom as dom
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
|
from xml.parsers.expat import ParserCreate
|
||||||
|
|
||||||
from libprs500.devices.prs500.prstypes import field
|
from libprs500.devices.prs500.prstypes import field
|
||||||
from libprs500.ebooks.metadata import MetaInformation
|
from libprs500.ebooks.metadata import MetaInformation
|
||||||
|
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||||
|
|
||||||
|
|
||||||
BYTE = "<B" #: Unsigned char little endian encoded in 1 byte
|
BYTE = "<B" #: Unsigned char little endian encoded in 1 byte
|
||||||
WORD = "<H" #: Unsigned short little endian encoded in 2 bytes
|
WORD = "<H" #: Unsigned short little endian encoded in 2 bytes
|
||||||
@ -94,12 +97,7 @@ class xml_attr_field(object):
|
|||||||
|
|
||||||
def __get__(self, obj, typ=None):
|
def __get__(self, obj, typ=None):
|
||||||
""" Return the data in this field or '' if the field is empty """
|
""" Return the data in this field or '' if the field is empty """
|
||||||
try:
|
document = obj.info
|
||||||
document = dom.parseString(obj.info)
|
|
||||||
except Exception, err:
|
|
||||||
print >>sys.stderr, "Could not parse XML:", err
|
|
||||||
print obj.info
|
|
||||||
raise
|
|
||||||
elems = document.getElementsByTagName(self.tag_name)
|
elems = document.getElementsByTagName(self.tag_name)
|
||||||
if len(elems):
|
if len(elems):
|
||||||
elem = None
|
elem = None
|
||||||
@ -113,12 +111,7 @@ class xml_attr_field(object):
|
|||||||
def __set__(self, obj, val):
|
def __set__(self, obj, val):
|
||||||
if val == None:
|
if val == None:
|
||||||
val = ""
|
val = ""
|
||||||
try:
|
document = obj.info
|
||||||
document = dom.parseString(obj.info)
|
|
||||||
except Exception, err:
|
|
||||||
print >>sys.stderr, "Could not parse XML:", err
|
|
||||||
print obj.info
|
|
||||||
raise
|
|
||||||
elems = document.getElementsByTagName(self.tag_name)
|
elems = document.getElementsByTagName(self.tag_name)
|
||||||
if len(elems):
|
if len(elems):
|
||||||
elem = None
|
elem = None
|
||||||
@ -127,8 +120,7 @@ class xml_attr_field(object):
|
|||||||
elem = candidate
|
elem = candidate
|
||||||
if elem:
|
if elem:
|
||||||
elem.setAttribute(self.attr, val)
|
elem.setAttribute(self.attr, val)
|
||||||
info = document.toxml(encoding='utf-16')
|
obj.info = document
|
||||||
obj.info = info
|
|
||||||
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
@ -152,12 +144,7 @@ class xml_field(object):
|
|||||||
|
|
||||||
def __get__(self, obj, typ=None):
|
def __get__(self, obj, typ=None):
|
||||||
""" Return the data in this field or '' if the field is empty """
|
""" Return the data in this field or '' if the field is empty """
|
||||||
try:
|
document = obj.info
|
||||||
document = dom.parseString(obj.info)
|
|
||||||
except Exception, err:
|
|
||||||
print >>sys.stderr, "Could not parse XML:", err
|
|
||||||
print obj.info
|
|
||||||
raise
|
|
||||||
|
|
||||||
elems = document.getElementsByTagName(self.tag_name)
|
elems = document.getElementsByTagName(self.tag_name)
|
||||||
if len(elems):
|
if len(elems):
|
||||||
@ -174,12 +161,7 @@ class xml_field(object):
|
|||||||
def __set__(self, obj, val):
|
def __set__(self, obj, val):
|
||||||
if not val:
|
if not val:
|
||||||
val = ''
|
val = ''
|
||||||
try:
|
document = obj.info
|
||||||
document = dom.parseString(obj.info)
|
|
||||||
except Exception, err:
|
|
||||||
print >>sys.stderr, "Could not parse XML:", err
|
|
||||||
print obj.info
|
|
||||||
raise
|
|
||||||
def create_elem():
|
def create_elem():
|
||||||
elem = document.createElement(self.tag_name)
|
elem = document.createElement(self.tag_name)
|
||||||
parent = document.getElementsByTagName(self.parent)[0]
|
parent = document.getElementsByTagName(self.parent)[0]
|
||||||
@ -206,8 +188,7 @@ class xml_field(object):
|
|||||||
else:
|
else:
|
||||||
elem = create_elem()
|
elem = create_elem()
|
||||||
elem.appendChild(document.createTextNode(val))
|
elem.appendChild(document.createTextNode(val))
|
||||||
info = document.toxml(encoding='utf-16')
|
obj.info = document
|
||||||
obj.info = info
|
|
||||||
|
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
@ -371,9 +352,8 @@ class LRFMetaFile(object):
|
|||||||
def info():
|
def info():
|
||||||
doc = \
|
doc = \
|
||||||
"""
|
"""
|
||||||
Document meta information in raw XML format as a byte string encoded in
|
Document meta information as a minidom Document object.
|
||||||
utf-16.
|
To set use a minidom document object.
|
||||||
To set use raw XML in a byte string encoded in utf-16.
|
|
||||||
"""
|
"""
|
||||||
def fget(self):
|
def fget(self):
|
||||||
if self.compressed_info_size == 0:
|
if self.compressed_info_size == 0:
|
||||||
@ -381,30 +361,16 @@ class LRFMetaFile(object):
|
|||||||
size = self.compressed_info_size - 4
|
size = self.compressed_info_size - 4
|
||||||
self._file.seek(self.info_start)
|
self._file.seek(self.info_start)
|
||||||
try:
|
try:
|
||||||
src = zlib.decompress(self._file.read(size))
|
src = zlib.decompress(self._file.read(size))
|
||||||
if len(src) != self.uncompressed_info_size:
|
if len(src) != self.uncompressed_info_size:
|
||||||
raise LRFException("Decompression of document meta info\
|
raise LRFException("Decompression of document meta info\
|
||||||
yielded unexpected results")
|
yielded unexpected results")
|
||||||
candidate = unicode(src, 'utf-16', 'replace')
|
return dom.parseString(src)
|
||||||
# LRF files produced with makelrf don't have a correctly
|
|
||||||
# encoded metadata block.
|
|
||||||
# Decoding using latin1 is the most useful for me since I
|
|
||||||
# occasionally read French books.
|
|
||||||
# pdflrf creates invalid metadata blocks
|
|
||||||
candidate = re.compile(u'</Info>.*', re.DOTALL).sub(u'</Info>', candidate)
|
|
||||||
if not u"Info" in candidate:
|
|
||||||
candidate = unicode(src, 'latin1', errors='ignore')
|
|
||||||
if candidate[-1:] == '\0':
|
|
||||||
candidate = candidate[:-1]
|
|
||||||
candidate = dom.parseString(candidate.encode('utf-8')).\
|
|
||||||
toxml(encoding='utf-16').strip()
|
|
||||||
else:
|
|
||||||
candidate = candidate.strip().encode('utf-16')
|
|
||||||
return candidate
|
|
||||||
except zlib.error:
|
except zlib.error:
|
||||||
raise LRFException("Unable to decompress document meta information")
|
raise LRFException("Unable to decompress document meta information")
|
||||||
|
|
||||||
def fset(self, info):
|
def fset(self, document):
|
||||||
|
info = document.toxml('utf-8')
|
||||||
self.uncompressed_info_size = len(info)
|
self.uncompressed_info_size = len(info)
|
||||||
stream = zlib.compress(info)
|
stream = zlib.compress(info)
|
||||||
orig_size = self.compressed_info_size
|
orig_size = self.compressed_info_size
|
||||||
|
Loading…
x
Reference in New Issue
Block a user