mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Greatly improved decoding of the metadata block in LRF files. We now handle the incorrectly encoded metadata blocks produced by makelrf 0.3 by assuming a latin1 encoding. Also added additional fields and support for creating a field that does not yet exist. Metadata blocks are now always written correctly encoded in UTF-16, regardless of the encoding of the original metadata block.
This commit is contained in:
parent
c25865e76b
commit
cead54c959
@ -27,15 +27,13 @@ to get and set meta information. For example:
|
|||||||
import struct
|
import struct
|
||||||
import array
|
import array
|
||||||
import zlib
|
import zlib
|
||||||
import StringIO
|
|
||||||
import xml.dom.minidom as dom
|
import xml.dom.minidom as dom
|
||||||
from xml.dom.ext import Print
|
|
||||||
|
|
||||||
from libprs500.prstypes import field
|
from libprs500.prstypes import field
|
||||||
|
|
||||||
BYTE = "<B" #: Unsigned char little endian encoded in 1 byte
|
BYTE = "<B" #: Unsigned char little endian encoded in 1 byte
|
||||||
WORD = "<H" #: Unsigned short little endian encoded in 2 bytes
|
WORD = "<H" #: Unsigned short little endian encoded in 2 bytes
|
||||||
DWORD = "<I" #: Unsigned integer little endian encoded in 4 bytes
|
DWORD = "<I" #: Unsigned integer little endian encoded in 4 bytes
|
||||||
QWORD = "<Q" #: Unsigned long long little endian encoded in 8 bytes
|
QWORD = "<Q" #: Unsigned long long little endian encoded in 8 bytes
|
||||||
|
|
||||||
class versioned_field(field):
|
class versioned_field(field):
|
||||||
@ -91,36 +89,68 @@ class xml_field(object):
|
|||||||
Descriptor that gets and sets XML based meta information from an LRF file.
|
Descriptor that gets and sets XML based meta information from an LRF file.
|
||||||
Works for simple XML fields of the form <tagname>data</tagname>
|
Works for simple XML fields of the form <tagname>data</tagname>
|
||||||
"""
|
"""
|
||||||
def __init__(self, tag_name):
|
def __init__(self, tag_name, parent="BookInfo"):
|
||||||
""" @param tag_name: The XML tag whoose data we operate on """
|
"""
|
||||||
|
@param tag_name: The XML tag whose data we operate on
|
||||||
|
@param parent: The tagname of the parent element of C{tag_name}
|
||||||
|
"""
|
||||||
self.tag_name = tag_name
|
self.tag_name = tag_name
|
||||||
|
self.parent = parent
|
||||||
|
|
||||||
def __get__(self, obj, typ=None):
|
def __get__(self, obj, typ=None):
|
||||||
|
""" Return the data in this field or '' if the field is empty """
|
||||||
document = dom.parseString(obj.info)
|
document = dom.parseString(obj.info)
|
||||||
elem = document.getElementsByTagName(self.tag_name)[0]
|
elems = document.getElementsByTagName(self.tag_name)
|
||||||
elem.normalize()
|
if len(elems):
|
||||||
if not elem.hasChildNodes():
|
elem = None
|
||||||
return ""
|
for candidate in elems:
|
||||||
return elem.firstChild.data.strip()
|
if candidate.parentNode.nodeName == self.parent:
|
||||||
|
elem = candidate
|
||||||
|
if elem:
|
||||||
|
elem.normalize()
|
||||||
|
if elem.hasChildNodes():
|
||||||
|
return elem.firstChild.data.strip()
|
||||||
|
return ""
|
||||||
|
|
||||||
def __set__(self, obj, val):
|
def __set__(self, obj, val):
|
||||||
document = dom.parseString(obj.info)
|
document = dom.parseString(obj.info)
|
||||||
elem = document.getElementsByTagName(self.tag_name)[0]
|
def create_elem():
|
||||||
elem.normalize()
|
elem = document.createElement(self.tag_name)
|
||||||
while elem.hasChildNodes():
|
elem.appendChild(dom.Text())
|
||||||
elem.removeChild(elem.lastChild)
|
parent = document.getElementsByTagName(self.parent)[0]
|
||||||
elem.appendChild(dom.Text())
|
parent.appendChild(elem)
|
||||||
|
return elem
|
||||||
|
|
||||||
|
if not val:
|
||||||
|
val = u''
|
||||||
|
if type(val).__name__ != 'unicode':
|
||||||
|
val = unicode(val, 'utf-8')
|
||||||
|
|
||||||
|
elems = document.getElementsByTagName(self.tag_name)
|
||||||
|
elem = None
|
||||||
|
if len(elems):
|
||||||
|
for candidate in elems:
|
||||||
|
if candidate.parentNode.nodeName == self.parent:
|
||||||
|
elem = candidate
|
||||||
|
if not elem:
|
||||||
|
elem = create_elem()
|
||||||
|
else:
|
||||||
|
elem.normalize()
|
||||||
|
while elem.hasChildNodes():
|
||||||
|
elem.removeChild(elem.lastChild)
|
||||||
|
elem.appendChild(dom.Text())
|
||||||
|
else:
|
||||||
|
elem = create_elem()
|
||||||
elem.firstChild.data = val
|
elem.firstChild.data = val
|
||||||
s = StringIO.StringIO()
|
info = document.toxml(encoding='utf-16')
|
||||||
Print(document, s)
|
obj.info = info
|
||||||
obj.info = s.getvalue()
|
|
||||||
s.close()
|
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.tag_name
|
return self.tag_name
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "XML Field: " + self.tag_name
|
return "XML Field: " + self.tag_name + " in " + self.parent
|
||||||
|
|
||||||
class LRFMetaFile(object):
|
class LRFMetaFile(object):
|
||||||
""" Has properties to read and write all Meta information in a LRF file. """
|
""" Has properties to read and write all Meta information in a LRF file. """
|
||||||
@ -128,36 +158,39 @@ class LRFMetaFile(object):
|
|||||||
LRF_HEADER = u'LRF'.encode('utf-16')[2:]+'\0\0'
|
LRF_HEADER = u'LRF'.encode('utf-16')[2:]+'\0\0'
|
||||||
|
|
||||||
lrf_header = fixed_stringfield(length=8, start=0)
|
lrf_header = fixed_stringfield(length=8, start=0)
|
||||||
version = field(fmt=WORD, start=8)
|
version = field(fmt=WORD, start=8)
|
||||||
xor_key = field(fmt=WORD, start=10)
|
xor_key = field(fmt=WORD, start=10)
|
||||||
root_object_id = field(fmt=DWORD, start=12)
|
root_object_id = field(fmt=DWORD, start=12)
|
||||||
number_of_objets = field(fmt=QWORD, start=16)
|
number_of_objets = field(fmt=QWORD, start=16)
|
||||||
object_index_offset = field(fmt=QWORD, start=24)
|
object_index_offset = field(fmt=QWORD, start=24)
|
||||||
binding = field(fmt=BYTE, start=36)
|
binding = field(fmt=BYTE, start=36)
|
||||||
dpi = field(fmt=WORD, start=38)
|
dpi = field(fmt=WORD, start=38)
|
||||||
width = field(fmt=WORD, start=42)
|
width = field(fmt=WORD, start=42)
|
||||||
height = field(fmt=WORD, start=44)
|
height = field(fmt=WORD, start=44)
|
||||||
color_depth = field(fmt=BYTE, start=46)
|
color_depth = field(fmt=BYTE, start=46)
|
||||||
toc_object_id = field(fmt=DWORD, start=0x44)
|
toc_object_id = field(fmt=DWORD, start=0x44)
|
||||||
toc_object_offset = field(fmt=DWORD, start=0x48)
|
toc_object_offset = field(fmt=DWORD, start=0x48)
|
||||||
compressed_info_size = field(fmt=WORD, start=0x4c)
|
compressed_info_size = field(fmt=WORD, start=0x4c)
|
||||||
thumbnail_type = versioned_field(version, 800, fmt=WORD, start=0x4e)
|
thumbnail_type = versioned_field(version, 800, fmt=WORD, start=0x4e)
|
||||||
thumbnail_size = versioned_field(version, 800, fmt=DWORD, start=0x50)
|
thumbnail_size = versioned_field(version, 800, fmt=DWORD, start=0x50)
|
||||||
uncompressed_info_size = versioned_field(compressed_info_size, 0, \
|
uncompressed_info_size = versioned_field(compressed_info_size, 0, \
|
||||||
fmt=DWORD, start=0x54)
|
fmt=DWORD, start=0x54)
|
||||||
|
|
||||||
title = xml_field("Title")
|
title = xml_field("Title", parent="BookInfo")
|
||||||
author = xml_field("Author")
|
author = xml_field("Author", parent="BookInfo")
|
||||||
book_id = xml_field("BookID")
|
book_id = xml_field("BookID", parent="BookInfo")
|
||||||
publisher = xml_field("Publisher")
|
publisher = xml_field("Publisher", parent="BookInfo")
|
||||||
label = xml_field("Label")
|
label = xml_field("Label", parent="BookInfo")
|
||||||
category = xml_field("Category")
|
category = xml_field("Category", parent="BookInfo")
|
||||||
|
classification = xml_field("Classification", parent="BookInfo")
|
||||||
|
free_text = xml_field("FreeText", parent="BookInfo")
|
||||||
|
|
||||||
language = xml_field("Language")
|
language = xml_field("Language", parent="DocInfo")
|
||||||
creator = xml_field("Creator")
|
creator = xml_field("Creator", parent="DocInfo")
|
||||||
creation_date = xml_field("CreationDate") #: Format is %Y-%m-%d
|
# Format is %Y-%m-%d
|
||||||
producer = xml_field("Producer")
|
creation_date = xml_field("CreationDate", parent="DocInfo")
|
||||||
page = xml_field("Page")
|
producer = xml_field("Producer", parent="DocInfo")
|
||||||
|
page = xml_field("Page", parent="DocInfo")
|
||||||
|
|
||||||
def safe(func):
|
def safe(func):
|
||||||
"""
|
"""
|
||||||
@ -198,20 +231,36 @@ class LRFMetaFile(object):
|
|||||||
|
|
||||||
@safe_property
|
@safe_property
|
||||||
def info():
|
def info():
|
||||||
doc = """ Document meta information in raw XML format """
|
doc = \
|
||||||
|
"""
|
||||||
|
Document meta information in raw XML format as a byte string encoded in
|
||||||
|
utf-16.
|
||||||
|
To set use raw XML in a byte string encoded in utf-16.
|
||||||
|
"""
|
||||||
def fget(self):
|
def fget(self):
|
||||||
if self.compressed_info_size == 0:
|
if self.compressed_info_size == 0:
|
||||||
raise LRFException("This document has no meta info")
|
raise LRFException("This document has no meta info")
|
||||||
size = self.compressed_info_size - 4
|
size = self.compressed_info_size - 4
|
||||||
self._file.seek(self.info_start)
|
self._file.seek(self.info_start)
|
||||||
try:
|
try:
|
||||||
stream = zlib.decompress(self._file.read(size))
|
src = zlib.decompress(self._file.read(size))
|
||||||
if len(stream) != self.uncompressed_info_size:
|
if len(src) != self.uncompressed_info_size:
|
||||||
raise LRFException("Decompression of document meta info\
|
raise LRFException("Decompression of document meta info\
|
||||||
yielded unexpected results")
|
yielded unexpected results")
|
||||||
# Remove null characters from string as in some LRF files
|
candidate = unicode(src, 'utf-16')
|
||||||
# the stream is null-terminated
|
# LRF files produced with makelrf don't have a correctly
|
||||||
return stream.strip().replace('\0', '')
|
# encoded metadata block.
|
||||||
|
# Decoding using latin1 is the most useful for me since I
|
||||||
|
# occasionally read French books.
|
||||||
|
if not u"Info" in candidate:
|
||||||
|
candidate = unicode(src, 'latin1', errors='ignore')
|
||||||
|
if candidate[-1:] == '\0':
|
||||||
|
candidate = candidate[:-1]
|
||||||
|
candidate = dom.parseString(candidate.encode('utf-8')).\
|
||||||
|
toxml(encoding='utf-16')
|
||||||
|
else:
|
||||||
|
candidate = candidate.encode('utf-16')
|
||||||
|
return candidate.strip()
|
||||||
except zlib.error, e:
|
except zlib.error, e:
|
||||||
raise LRFException("Unable to decompress document meta information")
|
raise LRFException("Unable to decompress document meta information")
|
||||||
|
|
||||||
@ -222,6 +271,7 @@ class LRFMetaFile(object):
|
|||||||
self._file.seek(self.info_start)
|
self._file.seek(self.info_start)
|
||||||
self._file.write(stream)
|
self._file.write(stream)
|
||||||
self._file.flush()
|
self._file.flush()
|
||||||
|
|
||||||
return { "fget":fget, "fset":fset, "doc":doc }
|
return { "fget":fget, "fset":fset, "doc":doc }
|
||||||
|
|
||||||
@safe_property
|
@safe_property
|
||||||
|
Loading…
x
Reference in New Issue
Block a user