Add support for reading metadata from LRX files (thanks to igorsk)

This commit is contained in:
Kovid Goyal 2008-12-11 13:47:29 -08:00
parent f152f37ec0
commit 371c1bee5b
2 changed files with 92 additions and 2 deletions

View File

@ -0,0 +1,89 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Read metadata from LRX files
'''
import sys, struct
from zlib import decompress
from lxml import etree
from calibre.ebooks.metadata import MetaInformation, string_to_authors
def _read(f, at, amount):
    '''
    Return the result of reading ``amount`` bytes from ``f`` at offset ``at``.

    The stream is repositioned with ``f.seek(at)`` first, so its file
    position is left just past whatever was read.
    '''
    f.seek(at)
    data = f.read(amount)
    return data
def word_be(buf):
    '''
    Decode ``buf`` as a single big-endian unsigned 32-bit integer.

    ``buf`` must be exactly four bytes long; ``struct.unpack`` raises
    ``struct.error`` for any other length.
    '''
    (value,) = struct.unpack('>L', buf)
    return value
def word_le(buf):
    '''
    Decode ``buf`` as a single little-endian unsigned 32-bit integer.

    ``buf`` must be exactly four bytes long; ``struct.unpack`` raises
    ``struct.error`` for any other length.
    '''
    (value,) = struct.unpack('<L', buf)
    return value
def short_le(buf):
    '''
    Decode ``buf`` as a single little-endian unsigned 16-bit integer.

    ``buf`` must be exactly two bytes long; ``struct.unpack`` raises
    ``struct.error`` for any other length.
    '''
    (value,) = struct.unpack('<H', buf)
    return value
def short_be(buf):
    '''
    Decode ``buf`` as a single big-endian unsigned 16-bit integer.

    ``buf`` must be exactly two bytes long; ``struct.unpack`` raises
    ``struct.error`` for any other length.
    '''
    (value,) = struct.unpack('>H', buf)
    return value
def _read_exact(f, at, amount):
    '''
    Read exactly ``amount`` bytes from ``f`` at offset ``at``.

    :raises ValueError: if fewer than ``amount`` bytes are available, so a
        truncated file is reported like any other invalid input instead of
        surfacing later as ``struct.error`` from the integer decoders.
    '''
    data = _read(f, at, amount)
    if len(data) != amount:
        raise ValueError('Not a valid LRX file')
    return data


def get_metadata(f):
    '''
    Read book metadata from the LRX file open in the binary stream ``f``.

    The stream is rewound and its first 12 bytes are checked for the
    ``ftypLRX2`` marker. Length-prefixed boxes are then skipped until one
    tagged ``bbeb`` is found; the data following that tag must start with
    the UTF-16-LE text ``LRF\\0``. The zlib-compressed XML info block of
    that section is decompressed and parsed.

    :param f: seekable file-like object opened in binary mode.
    :return: a ``MetaInformation`` built from the ``Title`` and ``Author``
        elements, with ``title_sort``, ``author_sort``, ``publisher``,
        ``tags`` and ``language`` filled in from the XML. ``publisher`` and
        ``language`` are ``None`` when the corresponding element is absent.
    :raises ValueError: if the stream is not an LRX file, is truncated, or
        its metadata section is malformed (wrong decompressed size, missing
        ``BookInfo``/``Title``/``Author`` elements).
    :raises zlib.error: if the compressed info block is not valid zlib data.
    '''
    f.seek(0)
    buf = f.read(12)
    # Bytes literals keep these comparisons meaningful on Python 3, where
    # bytes never compare equal to str; on Python 2.6+ they are plain str.
    if buf[4:] != b'ftypLRX2':
        # NOTE(review): a 4-byte slice is compared with a 3-byte literal, so
        # this can only match when the whole file is 7 bytes long. The
        # intended Librie magic/offset is not visible here -- confirm and fix.
        if buf[4:8] == b'LRX':
            raise ValueError('Librie LRX format not supported')
        raise ValueError('Not a LRX file')

    # Hop from box to box (big-endian size in the first four bytes) until
    # the one tagged 'bbeb' is reached.
    offset = 0
    while True:
        box_size = word_be(buf[:4])
        if box_size == 0:
            # A zero size would never advance ``offset`` and loop forever.
            raise ValueError('Not a valid LRX file')
        offset += box_size
        try:
            buf = _read_exact(f, offset, 8)
        except EnvironmentError:
            raise ValueError('Not a valid LRX file')
        if buf[4:] == b'bbeb':
            break

    offset += 8
    buf = _read_exact(f, offset, 16)
    # 'LRF\0' encoded as UTF-16-LE.
    if buf[:8] != b'L\x00R\x00F\x00\x00\x00':
        raise ValueError('Not a valid LRX file')
    lrf_version = word_le(buf[8:12])

    offset += 0x4c
    compressed_size = short_le(_read_exact(f, offset, 2))
    offset += 2
    if lrf_version >= 800:
        # Six more header bytes precede the size field in these versions.
        offset += 6
    # NOTE(review): indentation was lost in the reviewed text; the 4-byte
    # adjustment is applied for every version on the assumption that the
    # stored size also counts the uncompressed-length field -- confirm.
    compressed_size -= 4
    if compressed_size <= 0:
        # A negative length would make f.read() return the rest of the file.
        raise ValueError('LRX file has malformed metadata section')
    uncompressed_size = word_le(_read_exact(f, offset, 4))
    # The compressed stream immediately follows the length just read.
    info = decompress(f.read(compressed_size))
    if len(info) != uncompressed_size:
        raise ValueError('LRX file has malformed metadata section')

    root = etree.fromstring(info)
    bi = root.find('BookInfo')
    title = bi.find('Title') if bi is not None else None
    author = bi.find('Author') if bi is not None else None
    if title is None or author is None:
        raise ValueError('LRX file has malformed metadata section')

    mi = MetaInformation(title.text, string_to_authors(author.text))
    mi.title_sort = title.get('reading', None)
    mi.author_sort = author.get('reading', None)
    mi.publisher = getattr(bi.find('Publisher'), 'text', None)
    mi.tags = [x.text for x in bi.findall('Category')]

    docinfo = root.find('DocInfo')
    language = docinfo.find('Language') if docinfo is not None else None
    mi.language = getattr(language, 'text', None)
    return mi
def main(args=sys.argv):
    '''
    Command line entry point: print the metadata of the LRX file ``args[1]``.

    :param args: argument vector; ``args[0]`` is the program name and
        ``args[1]`` the path of the file to inspect.
    :return: 0 on success, 1 if no file path was supplied.
    '''
    if len(args) < 2:
        prog = args[0] if args else 'lrx'
        sys.stderr.write('Usage: %s file.lrx\n' % prog)
        return 1
    # Close the file deterministically instead of leaking the handle.
    with open(args[1], 'rb') as stream:
        # Call form of print behaves the same on Python 2 and 3 for a
        # single argument.
        print(get_metadata(stream))
    return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == '__main__':
    raise SystemExit(main())

View File

@ -16,6 +16,7 @@ from calibre.ebooks.metadata.epub import get_metadata as epub_metadata
from calibre.ebooks.metadata.html import get_metadata as html_metadata from calibre.ebooks.metadata.html import get_metadata as html_metadata
from calibre.ebooks.mobi.reader import get_metadata as mobi_metadata from calibre.ebooks.mobi.reader import get_metadata as mobi_metadata
from calibre.ebooks.metadata.odt import get_metadata as odt_metadata from calibre.ebooks.metadata.odt import get_metadata as odt_metadata
from calibre.ebooks.metadata.lrx import get_metadata as lrx_metadata
from calibre.ebooks.metadata.opf2 import OPF from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.metadata.rtf import set_metadata as set_rtf_metadata from calibre.ebooks.metadata.rtf import set_metadata as set_rtf_metadata
from calibre.ebooks.lrf.meta import set_metadata as set_lrf_metadata from calibre.ebooks.lrf.meta import set_metadata as set_lrf_metadata
@ -29,12 +30,12 @@ except OSError:
from calibre.libunzip import extract_member as zip_extract_first from calibre.libunzip import extract_member as zip_extract_first
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
from calibre.ptempfile import TemporaryDirectory
_METADATA_PRIORITIES = [ _METADATA_PRIORITIES = [
'html', 'htm', 'xhtml', 'xhtm', 'html', 'htm', 'xhtml', 'xhtm',
'rtf', 'fb2', 'pdf', 'prc', 'odt', 'rtf', 'fb2', 'pdf', 'prc', 'odt',
'epub', 'lit', 'lrf', 'mobi', 'rb', 'imp' 'epub', 'lit', 'lrx', 'lrf', 'mobi',
'rb', 'imp'
] ]
# The priorities for loading metadata from different file types # The priorities for loading metadata from different file types