Comments.

This commit is contained in:
John Schember 2011-04-16 21:54:28 -04:00
parent 644335d97b
commit 494c040d36

View File

@ -136,6 +136,9 @@ def decompress_doc(data):
return ''.join([chr(i) for i in res]) return ''.join([chr(i) for i in res])
class HeaderRecord(object): class HeaderRecord(object):
'''
Plucker header. PDB record 0.
'''
def __init__(self, raw): def __init__(self, raw):
self.uid, = struct.unpack('>H', raw[0:2]) self.uid, = struct.unpack('>H', raw[0:2])
@ -144,6 +147,8 @@ class HeaderRecord(object):
# 1 is DOC compressed # 1 is DOC compressed
self.compression, = struct.unpack('>H', raw[2:4]) self.compression, = struct.unpack('>H', raw[2:4])
self.records, = struct.unpack('>H', raw[4:6]) self.records, = struct.unpack('>H', raw[4:6])
# uid of the first html file. This should link
# to other files which in turn may link to others.
self.home_html = None self.home_html = None
self.reserved = {} self.reserved = {}
@ -157,6 +162,10 @@ class HeaderRecord(object):
class SectionHeader(object): class SectionHeader(object):
'''
Every section (record) has this header. It gives
details about the section such as its uid.
'''
def __init__(self, raw): def __init__(self, raw):
self.uid, = struct.unpack('>H', raw[0:2]) self.uid, = struct.unpack('>H', raw[0:2])
@ -167,9 +176,14 @@ class SectionHeader(object):
class SectionHeaderText(object): class SectionHeaderText(object):
'''
Sub header for text records.
'''
def __init__(self, section_header, raw): def __init__(self, section_header, raw):
# The uncompressed size of each paragraph.
self.sizes = [] self.sizes = []
# Paragraph attributes.
self.attributes = [] self.attributes = []
for i in xrange(section_header.paragraphs): for i in xrange(section_header.paragraphs):
@ -179,6 +193,19 @@ class SectionHeaderText(object):
class SectionMetadata(object): class SectionMetadata(object):
'''
Metadata.
This does not store metadata such as title, or author.
That metadata would be best retrieved with the PDB (plucker)
metadata reader.
This stores document specific information such as the
text encoding.
Note: There is a default encoding but each text section
can be assigned a different encoding.
'''
def __init__(self, raw): def __init__(self, raw):
self.default_encoding = 'utf-8' self.default_encoding = 'utf-8'
@ -222,6 +249,9 @@ class SectionMetadata(object):
class SectionText(object): class SectionText(object):
'''
Text data. Stores a text section header and the PHTML.
'''
def __init__(self, section_header, raw): def __init__(self, section_header, raw):
self.header = SectionHeaderText(section_header, raw) self.header = SectionHeaderText(section_header, raw)
@ -229,14 +259,19 @@ class SectionText(object):
class SectionCompositeImage(object): class SectionCompositeImage(object):
'''
A composite image consists of a 2D array
of rows and columns. The entries in the array
are uids.
'''
def __init__(self, raw): def __init__(self, raw):
self.columns, = struct.unpack('>H', raw[0:2]) self.columns, = struct.unpack('>H', raw[0:2])
self.rows, = struct.unpack('>H', raw[2:4]) self.rows, = struct.unpack('>H', raw[2:4])
# [ # [
# row [col, col, col...], # [uid, uid, uid, ...],
# row [col, col, col...], # [uid, uid, uid, ...],
# ... # ...
# ] # ]
# #
@ -275,18 +310,21 @@ class Reader(FormatReader):
self.owner_id = None self.owner_id = None
self.sections = [] self.sections = []
# The Plucker record0 header
self.header_record = HeaderRecord(header.section_data(0)) self.header_record = HeaderRecord(header.section_data(0))
for i in range(1, header.num_sections): for i in range(1, header.num_sections):
section_number = i - 1 section_number = len(self.sections)
# The length of the section header.
# Where the actual data in the section starts.
start = 8 start = 8
section = None section = None
raw_data = header.section_data(i) raw_data = header.section_data(i)
# Every section has a section header.
section_header = SectionHeader(raw_data) section_header = SectionHeader(raw_data)
self.uid_section_number[section_header.uid] = section_number # Store sections we care about.
if section_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED): if section_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED):
self.uid_text_secion_number[section_header.uid] = section_number self.uid_text_secion_number[section_header.uid] = section_number
section = SectionText(section_header, raw_data[start:]) section = SectionText(section_header, raw_data[start:])
@ -300,8 +338,13 @@ class Reader(FormatReader):
self.uid_composite_image_section_number[section_header.uid] = section_number self.uid_composite_image_section_number[section_header.uid] = section_number
section = SectionCompositeImage(raw_data[start:]) section = SectionCompositeImage(raw_data[start:])
self.sections.append((section_header, section)) # Store the section.
if section:
self.uid_section_number[section_header.uid] = section_number
self.sections.append((section_header, section))
# Store useful information from the metadata section locally
# to make access easier.
if self.metadata_section_number: if self.metadata_section_number:
mdata_section = self.sections[self.metadata_section_number][1] mdata_section = self.sections[self.metadata_section_number][1]
for k, v in mdata_section.exceptional_uid_encodings.items(): for k, v in mdata_section.exceptional_uid_encodings.items():
@ -309,13 +352,16 @@ class Reader(FormatReader):
self.default_encoding = mdata_section.default_encoding self.default_encoding = mdata_section.default_encoding
self.owner_id = mdata_section.owner_id self.owner_id = mdata_section.owner_id
# Get the metadata (title, author, ...) with the metadata reader.
from calibre.ebooks.metadata.pdb import get_metadata from calibre.ebooks.metadata.pdb import get_metadata
self.mi = get_metadata(stream, False) self.mi = get_metadata(stream, False)
def extract_content(self, output_dir): def extract_content(self, output_dir):
# Each text record is independent (unless the continuation # Each text record is independent (unless the continuation
# value is set in the previous record). Put each converted # value is set in the previous record). Put each converted
# text recored into a separate file. # text recored into a separate file. We will reference the
# home.html file as the first file and let the HTML input
# plugin assemble the order based on hyperlinks.
with CurrentDir(output_dir): with CurrentDir(output_dir):
for uid, num in self.uid_text_secion_number.items(): for uid, num in self.uid_text_secion_number.items():
self.log.debug(_('Writing record with uid: %s as %s.html' % (uid, uid))) self.log.debug(_('Writing record with uid: %s as %s.html' % (uid, uid)))
@ -329,8 +375,9 @@ class Reader(FormatReader):
html += self.process_phtml(section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace') html += self.process_phtml(section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
html += '</body></html>' html += '</body></html>'
htmlf.write(html.encode('utf-8')) htmlf.write(html.encode('utf-8'))
images = [] # Images.
# Cache the image sizes in case they are used by a composite image.
image_sizes = {} image_sizes = {}
if not os.path.exists(os.path.join(output_dir, 'images/')): if not os.path.exists(os.path.join(output_dir, 'images/')):
os.makedirs(os.path.join(output_dir, 'images/')) os.makedirs(os.path.join(output_dir, 'images/'))
@ -359,10 +406,10 @@ class Reader(FormatReader):
self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid)) self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
except Exception as e: except Exception as e:
self.log.error('Failed to write image with uid %s: %s' % (uid, e)) self.log.error('Failed to write image with uid %s: %s' % (uid, e))
images.append('%s.jpg' % uid)
else: else:
self.log.error('Failed to write image with uid %s: No data.' % uid) self.log.error('Failed to write image with uid %s: No data.' % uid)
# Composite images. # Composite images.
# We're going to use the already compressed .jpg images here.
for uid, num in self.uid_composite_image_section_number.items(): for uid, num in self.uid_composite_image_section_number.items():
try: try:
section_header, section_data = self.sections[num] section_header, section_data = self.sections[num]
@ -559,7 +606,10 @@ class Reader(FormatReader):
# 4 Bytes # 4 Bytes
# alternate image record ID, image record ID # alternate image record ID, image record ID
elif c == 0x5c: elif c == 0x5c:
offset += 4 offset += 3
uid = struct.unpack('>H', d[offset:offset+2])[0]
html += '<img src="images/%s.jpg" />' % uid
offset += 1
# Underline text begins # Underline text begins
# 0 Bytes # 0 Bytes
elif c == 0x60: elif c == 0x60: