mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Minor cleanups in pdb code
This commit is contained in:
parent
a8ee3491df
commit
624390db05
@ -8,7 +8,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import os, re, sys, struct, zlib
|
import os, re, struct, zlib
|
||||||
|
|
||||||
from calibre import CurrentDir
|
from calibre import CurrentDir
|
||||||
from calibre.ebooks import DRMError
|
from calibre.ebooks import DRMError
|
||||||
@ -30,7 +30,7 @@ class HeaderRecord(object):
|
|||||||
|
|
||||||
def __init__(self, raw):
|
def __init__(self, raw):
|
||||||
self.version, = struct.unpack('>H', raw[0:2])
|
self.version, = struct.unpack('>H', raw[0:2])
|
||||||
self.non_text_offset, = struct.unpack('>H', raw[12:14])
|
self.non_text_offset, = struct.unpack('>H', raw[12:14])
|
||||||
self.footnote_rec, = struct.unpack('>H', raw[28:30])
|
self.footnote_rec, = struct.unpack('>H', raw[28:30])
|
||||||
self.sidebar_rec, = struct.unpack('>H', raw[30:32])
|
self.sidebar_rec, = struct.unpack('>H', raw[30:32])
|
||||||
self.bookmark_offset, = struct.unpack('>H', raw[32:34])
|
self.bookmark_offset, = struct.unpack('>H', raw[32:34])
|
||||||
@ -39,17 +39,17 @@ class HeaderRecord(object):
|
|||||||
self.footnote_offset, = struct.unpack('>H', raw[48:50])
|
self.footnote_offset, = struct.unpack('>H', raw[48:50])
|
||||||
self.sidebar_offset, = struct.unpack('>H', raw[50:52])
|
self.sidebar_offset, = struct.unpack('>H', raw[50:52])
|
||||||
self.last_data_offset, = struct.unpack('>H', raw[52:54])
|
self.last_data_offset, = struct.unpack('>H', raw[52:54])
|
||||||
|
|
||||||
self.num_text_pages = self.non_text_offset - 1
|
self.num_text_pages = self.non_text_offset - 1
|
||||||
self.num_image_pages = self.metadata_offset - self.image_data_offset
|
self.num_image_pages = self.metadata_offset - self.image_data_offset
|
||||||
|
|
||||||
|
|
||||||
class Reader(FormatReader):
|
class Reader(FormatReader):
|
||||||
|
|
||||||
def __init__(self, header, stream, log, encoding=None):
|
def __init__(self, header, stream, log, encoding=None):
|
||||||
self.log = log
|
self.log = log
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
|
|
||||||
self.sections = []
|
self.sections = []
|
||||||
for i in range(header.num_sections):
|
for i in range(header.num_sections):
|
||||||
self.sections.append(header.section_data(i))
|
self.sections.append(header.section_data(i))
|
||||||
@ -61,17 +61,17 @@ class Reader(FormatReader):
|
|||||||
raise DRMError('eReader DRM is not supported.')
|
raise DRMError('eReader DRM is not supported.')
|
||||||
else:
|
else:
|
||||||
raise EreaderError('Unknown book version %i.' % self.header_record.version)
|
raise EreaderError('Unknown book version %i.' % self.header_record.version)
|
||||||
|
|
||||||
def section_data(self, number):
|
def section_data(self, number):
|
||||||
return self.sections[number]
|
return self.sections[number]
|
||||||
|
|
||||||
def decompress_text(self, number):
|
def decompress_text(self, number):
|
||||||
if self.header_record.version == 2:
|
if self.header_record.version == 2:
|
||||||
return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
|
return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
|
||||||
if self.header_record.version == 10:
|
if self.header_record.version == 10:
|
||||||
return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
|
return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
|
||||||
|
|
||||||
|
|
||||||
def get_image(self, number):
|
def get_image(self, number):
|
||||||
if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
|
if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
|
||||||
return 'empty', ''
|
return 'empty', ''
|
||||||
@ -79,7 +79,7 @@ class Reader(FormatReader):
|
|||||||
name = data[4:4+32].strip('\x00')
|
name = data[4:4+32].strip('\x00')
|
||||||
img = data[62:]
|
img = data[62:]
|
||||||
return name, img
|
return name, img
|
||||||
|
|
||||||
def get_text_page(self, number):
|
def get_text_page(self, number):
|
||||||
'''
|
'''
|
||||||
Only palmdoc and zlib compressed are supported. The text is
|
Only palmdoc and zlib compressed are supported. The text is
|
||||||
@ -88,21 +88,21 @@ class Reader(FormatReader):
|
|||||||
'''
|
'''
|
||||||
if number not in range(1, self.header_record.num_text_pages + 1):
|
if number not in range(1, self.header_record.num_text_pages + 1):
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
return self.decompress_text(number)
|
return self.decompress_text(number)
|
||||||
|
|
||||||
def extract_content(self, output_dir):
|
def extract_content(self, output_dir):
|
||||||
output_dir = os.path.abspath(output_dir)
|
output_dir = os.path.abspath(output_dir)
|
||||||
|
|
||||||
if not os.path.exists(output_dir):
|
if not os.path.exists(output_dir):
|
||||||
os.makedirs(output_dir)
|
os.makedirs(output_dir)
|
||||||
|
|
||||||
html = u'<html><head><title></title></head><body>'
|
html = u'<html><head><title></title></head><body>'
|
||||||
|
|
||||||
for i in range(1, self.header_record.num_text_pages + 1):
|
for i in range(1, self.header_record.num_text_pages + 1):
|
||||||
self.log.debug('Extracting text page %i' % i)
|
self.log.debug('Extracting text page %i' % i)
|
||||||
html += pml_to_html(self.get_text_page(i))
|
html += pml_to_html(self.get_text_page(i))
|
||||||
|
|
||||||
if self.header_record.footnote_rec > 0:
|
if self.header_record.footnote_rec > 0:
|
||||||
html += '<br /><h1>%s</h1>' % _('Footnotes')
|
html += '<br /><h1>%s</h1>' % _('Footnotes')
|
||||||
footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
|
footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
|
||||||
@ -110,8 +110,8 @@ class Reader(FormatReader):
|
|||||||
self.log.debug('Extracting footnote page %i' % i)
|
self.log.debug('Extracting footnote page %i' % i)
|
||||||
html += '<dl>'
|
html += '<dl>'
|
||||||
html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
|
html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
|
||||||
html += '</dl>'
|
html += '</dl>'
|
||||||
|
|
||||||
if self.header_record.sidebar_rec > 0:
|
if self.header_record.sidebar_rec > 0:
|
||||||
html += '<br /><h1>%s</h1>' % _('Sidebar')
|
html += '<br /><h1>%s</h1>' % _('Sidebar')
|
||||||
sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
|
sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
|
||||||
@ -120,9 +120,9 @@ class Reader(FormatReader):
|
|||||||
html += '<dl>'
|
html += '<dl>'
|
||||||
html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i))
|
html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i))
|
||||||
html += '</dl>'
|
html += '</dl>'
|
||||||
|
|
||||||
html += '</body></html>'
|
html += '</body></html>'
|
||||||
|
|
||||||
with CurrentDir(output_dir):
|
with CurrentDir(output_dir):
|
||||||
with open('index.html', 'wb') as index:
|
with open('index.html', 'wb') as index:
|
||||||
self.log.debug('Writing text to index.html')
|
self.log.debug('Writing text to index.html')
|
||||||
@ -138,19 +138,19 @@ class Reader(FormatReader):
|
|||||||
with open(name, 'wb') as imgf:
|
with open(name, 'wb') as imgf:
|
||||||
self.log.debug('Writing image %s to images/' % name)
|
self.log.debug('Writing image %s to images/' % name)
|
||||||
imgf.write(img)
|
imgf.write(img)
|
||||||
|
|
||||||
opf_path = self.create_opf(output_dir, images)
|
opf_path = self.create_opf(output_dir, images)
|
||||||
|
|
||||||
return opf_path
|
return opf_path
|
||||||
|
|
||||||
def create_opf(self, output_dir, images):
|
def create_opf(self, output_dir, images):
|
||||||
mi = MetaInformation(None, None)
|
mi = MetaInformation(None, None)
|
||||||
|
|
||||||
with CurrentDir(output_dir):
|
with CurrentDir(output_dir):
|
||||||
opf = OPFCreator(output_dir, mi)
|
opf = OPFCreator(output_dir, mi)
|
||||||
|
|
||||||
manifest = [('index.html', None)]
|
manifest = [('index.html', None)]
|
||||||
|
|
||||||
for i in images:
|
for i in images:
|
||||||
manifest.append((os.path.join('images/', i), None))
|
manifest.append((os.path.join('images/', i), None))
|
||||||
|
|
||||||
@ -158,21 +158,21 @@ class Reader(FormatReader):
|
|||||||
opf.create_spine(['index.html'])
|
opf.create_spine(['index.html'])
|
||||||
with open('metadata.opf', 'wb') as opffile:
|
with open('metadata.opf', 'wb') as opffile:
|
||||||
opf.render(opffile)
|
opf.render(opffile)
|
||||||
|
|
||||||
return os.path.join(output_dir, 'metadata.opf')
|
return os.path.join(output_dir, 'metadata.opf')
|
||||||
|
|
||||||
def dump_pml(self):
|
def dump_pml(self):
|
||||||
'''
|
'''
|
||||||
This is primarily used for debugging and 3rd party tools to
|
This is primarily used for debugging and 3rd party tools to
|
||||||
get the pml markup that comprises the text in the file.
|
get the pml markup that comprises the text in the file.
|
||||||
'''
|
'''
|
||||||
pml = ''
|
pml = ''
|
||||||
|
|
||||||
for i in range(1, self.header_record.num_text_pages + 1):
|
for i in range(1, self.header_record.num_text_pages + 1):
|
||||||
pml += self.get_text_page(i)
|
pml += self.get_text_page(i)
|
||||||
|
|
||||||
return pml
|
return pml
|
||||||
|
|
||||||
def dump_images(self, output_dir):
|
def dump_images(self, output_dir):
|
||||||
'''
|
'''
|
||||||
This is primarily used for debugging and 3rd party tools to
|
This is primarily used for debugging and 3rd party tools to
|
||||||
@ -181,7 +181,7 @@ class Reader(FormatReader):
|
|||||||
if not os.path.exists(output_dir):
|
if not os.path.exists(output_dir):
|
||||||
os.makedirs(output_dir)
|
os.makedirs(output_dir)
|
||||||
|
|
||||||
with CurrentDir(output_dir):
|
with CurrentDir(output_dir):
|
||||||
for i in range(0, self.header_record.num_image_pages):
|
for i in range(0, self.header_record.num_image_pages):
|
||||||
name, img = self.get_image(self.header_record.image_data_offset + i)
|
name, img = self.get_image(self.header_record.image_data_offset + i)
|
||||||
with open(name, 'wb') as imgf:
|
with open(name, 'wb') as imgf:
|
||||||
|
@ -8,7 +8,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import os, re, struct, time
|
import re, struct, time
|
||||||
|
|
||||||
class PdbHeaderReader(object):
|
class PdbHeaderReader(object):
|
||||||
|
|
||||||
@ -53,7 +53,8 @@ class PdbHeaderReader(object):
|
|||||||
|
|
||||||
start = self.section_offset(number)
|
start = self.section_offset(number)
|
||||||
if number == self.num_sections -1:
|
if number == self.num_sections -1:
|
||||||
end = os.stat(self.stream.name).st_size
|
self.stream.seek(0, 2)
|
||||||
|
end = self.stream.tell()
|
||||||
else:
|
else:
|
||||||
end = self.section_offset(number + 1)
|
end = self.section_offset(number + 1)
|
||||||
self.stream.seek(start)
|
self.stream.seek(start)
|
||||||
@ -65,18 +66,18 @@ class PdbHeaderBuilder(object):
|
|||||||
def __init__(self, identity, title):
|
def __init__(self, identity, title):
|
||||||
self.identity = identity.ljust(3, '\x00')[:8]
|
self.identity = identity.ljust(3, '\x00')[:8]
|
||||||
self.title = re.sub('[^-A-Za-z0-9]+', '_', title).ljust(32, '\x00')[:32]
|
self.title = re.sub('[^-A-Za-z0-9]+', '_', title).ljust(32, '\x00')[:32]
|
||||||
|
|
||||||
def build_header(self, section_lengths, out_stream):
|
def build_header(self, section_lengths, out_stream):
|
||||||
'''
|
'''
|
||||||
section_lengths = Length of each section in file.
|
section_lengths = Length of each section in file.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
now = int(time.time())
|
now = int(time.time())
|
||||||
nrecords = len(section_lengths)
|
nrecords = len(section_lengths)
|
||||||
|
|
||||||
out_stream.write(self.title + struct.pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0))
|
out_stream.write(self.title + struct.pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0))
|
||||||
out_stream.write(self.identity + struct.pack('>IIH', nrecords, 0, nrecords))
|
out_stream.write(self.identity + struct.pack('>IIH', nrecords, 0, nrecords))
|
||||||
|
|
||||||
offset = 78 + (8 * nrecords) + 2
|
offset = 78 + (8 * nrecords) + 2
|
||||||
for id, record in enumerate(section_lengths):
|
for id, record in enumerate(section_lengths):
|
||||||
out_stream.write(struct.pack('>LBBBB', long(offset), 0, 0, 0, 0))
|
out_stream.write(struct.pack('>LBBBB', long(offset), 0, 0, 0, 0))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user