mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Comments.
This commit is contained in:
parent
644335d97b
commit
494c040d36
@ -136,6 +136,9 @@ def decompress_doc(data):
|
|||||||
return ''.join([chr(i) for i in res])
|
return ''.join([chr(i) for i in res])
|
||||||
|
|
||||||
class HeaderRecord(object):
|
class HeaderRecord(object):
|
||||||
|
'''
|
||||||
|
Plucker header. PDB record 0.
|
||||||
|
'''
|
||||||
|
|
||||||
def __init__(self, raw):
|
def __init__(self, raw):
|
||||||
self.uid, = struct.unpack('>H', raw[0:2])
|
self.uid, = struct.unpack('>H', raw[0:2])
|
||||||
@ -144,6 +147,8 @@ class HeaderRecord(object):
|
|||||||
# 1 is DOC compressed
|
# 1 is DOC compressed
|
||||||
self.compression, = struct.unpack('>H', raw[2:4])
|
self.compression, = struct.unpack('>H', raw[2:4])
|
||||||
self.records, = struct.unpack('>H', raw[4:6])
|
self.records, = struct.unpack('>H', raw[4:6])
|
||||||
|
# uid of the first html file. This should link
|
||||||
|
# to other files which in turn may link to others.
|
||||||
self.home_html = None
|
self.home_html = None
|
||||||
|
|
||||||
self.reserved = {}
|
self.reserved = {}
|
||||||
@ -157,6 +162,10 @@ class HeaderRecord(object):
|
|||||||
|
|
||||||
|
|
||||||
class SectionHeader(object):
|
class SectionHeader(object):
|
||||||
|
'''
|
||||||
|
Every section (record) has this header. It gives
|
||||||
|
details about the section such as its uid.
|
||||||
|
'''
|
||||||
|
|
||||||
def __init__(self, raw):
|
def __init__(self, raw):
|
||||||
self.uid, = struct.unpack('>H', raw[0:2])
|
self.uid, = struct.unpack('>H', raw[0:2])
|
||||||
@ -167,9 +176,14 @@ class SectionHeader(object):
|
|||||||
|
|
||||||
|
|
||||||
class SectionHeaderText(object):
|
class SectionHeaderText(object):
|
||||||
|
'''
|
||||||
|
Sub header for text records.
|
||||||
|
'''
|
||||||
|
|
||||||
def __init__(self, section_header, raw):
|
def __init__(self, section_header, raw):
|
||||||
|
# The uncompressed size of each paragraph.
|
||||||
self.sizes = []
|
self.sizes = []
|
||||||
|
# Paragraph attributes.
|
||||||
self.attributes = []
|
self.attributes = []
|
||||||
|
|
||||||
for i in xrange(section_header.paragraphs):
|
for i in xrange(section_header.paragraphs):
|
||||||
@ -179,6 +193,19 @@ class SectionHeaderText(object):
|
|||||||
|
|
||||||
|
|
||||||
class SectionMetadata(object):
|
class SectionMetadata(object):
|
||||||
|
'''
|
||||||
|
Metadata.
|
||||||
|
|
||||||
|
This does not store metadata such as title, or author.
|
||||||
|
That metadata would be best retrieved with the PDB (plucker)
|
||||||
|
metadata reader.
|
||||||
|
|
||||||
|
This stores document specific information such as the
|
||||||
|
text encoding.
|
||||||
|
|
||||||
|
Note: There is a default encoding but each text section
|
||||||
|
can be assigned a different encoding.
|
||||||
|
'''
|
||||||
|
|
||||||
def __init__(self, raw):
|
def __init__(self, raw):
|
||||||
self.default_encoding = 'utf-8'
|
self.default_encoding = 'utf-8'
|
||||||
@ -222,6 +249,9 @@ class SectionMetadata(object):
|
|||||||
|
|
||||||
|
|
||||||
class SectionText(object):
|
class SectionText(object):
|
||||||
|
'''
|
||||||
|
Text data. Stores a text section header and the PHTML.
|
||||||
|
'''
|
||||||
|
|
||||||
def __init__(self, section_header, raw):
|
def __init__(self, section_header, raw):
|
||||||
self.header = SectionHeaderText(section_header, raw)
|
self.header = SectionHeaderText(section_header, raw)
|
||||||
@ -229,14 +259,19 @@ class SectionText(object):
|
|||||||
|
|
||||||
|
|
||||||
class SectionCompositeImage(object):
|
class SectionCompositeImage(object):
|
||||||
|
'''
|
||||||
|
A composite image consists of a 2D array
|
||||||
|
of rows and columns. The entries in the array
|
||||||
|
are uids.
|
||||||
|
'''
|
||||||
|
|
||||||
def __init__(self, raw):
|
def __init__(self, raw):
|
||||||
self.columns, = struct.unpack('>H', raw[0:2])
|
self.columns, = struct.unpack('>H', raw[0:2])
|
||||||
self.rows, = struct.unpack('>H', raw[2:4])
|
self.rows, = struct.unpack('>H', raw[2:4])
|
||||||
|
|
||||||
# [
|
# [
|
||||||
# row [col, col, col...],
|
# [uid, uid, uid, ...],
|
||||||
# row [col, col, col...],
|
# [uid, uid, uid, ...],
|
||||||
# ...
|
# ...
|
||||||
# ]
|
# ]
|
||||||
#
|
#
|
||||||
@ -275,18 +310,21 @@ class Reader(FormatReader):
|
|||||||
self.owner_id = None
|
self.owner_id = None
|
||||||
self.sections = []
|
self.sections = []
|
||||||
|
|
||||||
|
# The Plucker record0 header
|
||||||
self.header_record = HeaderRecord(header.section_data(0))
|
self.header_record = HeaderRecord(header.section_data(0))
|
||||||
|
|
||||||
for i in range(1, header.num_sections):
|
for i in range(1, header.num_sections):
|
||||||
section_number = i - 1
|
section_number = len(self.sections)
|
||||||
|
# The length of the section header.
|
||||||
|
# Where the actual data in the section starts.
|
||||||
start = 8
|
start = 8
|
||||||
section = None
|
section = None
|
||||||
|
|
||||||
raw_data = header.section_data(i)
|
raw_data = header.section_data(i)
|
||||||
|
# Every section has a section header.
|
||||||
section_header = SectionHeader(raw_data)
|
section_header = SectionHeader(raw_data)
|
||||||
|
|
||||||
self.uid_section_number[section_header.uid] = section_number
|
# Store sections we care about.
|
||||||
|
|
||||||
if section_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED):
|
if section_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED):
|
||||||
self.uid_text_secion_number[section_header.uid] = section_number
|
self.uid_text_secion_number[section_header.uid] = section_number
|
||||||
section = SectionText(section_header, raw_data[start:])
|
section = SectionText(section_header, raw_data[start:])
|
||||||
@ -300,8 +338,13 @@ class Reader(FormatReader):
|
|||||||
self.uid_composite_image_section_number[section_header.uid] = section_number
|
self.uid_composite_image_section_number[section_header.uid] = section_number
|
||||||
section = SectionCompositeImage(raw_data[start:])
|
section = SectionCompositeImage(raw_data[start:])
|
||||||
|
|
||||||
self.sections.append((section_header, section))
|
# Store the section.
|
||||||
|
if section:
|
||||||
|
self.uid_section_number[section_header.uid] = section_number
|
||||||
|
self.sections.append((section_header, section))
|
||||||
|
|
||||||
|
# Store useful information from the metadata section locally
|
||||||
|
# to make access easier.
|
||||||
if self.metadata_section_number:
|
if self.metadata_section_number:
|
||||||
mdata_section = self.sections[self.metadata_section_number][1]
|
mdata_section = self.sections[self.metadata_section_number][1]
|
||||||
for k, v in mdata_section.exceptional_uid_encodings.items():
|
for k, v in mdata_section.exceptional_uid_encodings.items():
|
||||||
@ -309,13 +352,16 @@ class Reader(FormatReader):
|
|||||||
self.default_encoding = mdata_section.default_encoding
|
self.default_encoding = mdata_section.default_encoding
|
||||||
self.owner_id = mdata_section.owner_id
|
self.owner_id = mdata_section.owner_id
|
||||||
|
|
||||||
|
# Get the metadata (title, author, ...) with the metadata reader.
|
||||||
from calibre.ebooks.metadata.pdb import get_metadata
|
from calibre.ebooks.metadata.pdb import get_metadata
|
||||||
self.mi = get_metadata(stream, False)
|
self.mi = get_metadata(stream, False)
|
||||||
|
|
||||||
def extract_content(self, output_dir):
|
def extract_content(self, output_dir):
|
||||||
# Each text record is independent (unless the continuation
|
# Each text record is independent (unless the continuation
|
||||||
# value is set in the previous record). Put each converted
|
# value is set in the previous record). Put each converted
|
||||||
# text recored into a separate file.
|
# text record into a separate file. We will reference the
|
||||||
|
# home.html file as the first file and let the HTML input
|
||||||
|
# plugin assemble the order based on hyperlinks.
|
||||||
with CurrentDir(output_dir):
|
with CurrentDir(output_dir):
|
||||||
for uid, num in self.uid_text_secion_number.items():
|
for uid, num in self.uid_text_secion_number.items():
|
||||||
self.log.debug(_('Writing record with uid: %s as %s.html' % (uid, uid)))
|
self.log.debug(_('Writing record with uid: %s as %s.html' % (uid, uid)))
|
||||||
@ -329,8 +375,9 @@ class Reader(FormatReader):
|
|||||||
html += self.process_phtml(section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
|
html += self.process_phtml(section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
|
||||||
html += '</body></html>'
|
html += '</body></html>'
|
||||||
htmlf.write(html.encode('utf-8'))
|
htmlf.write(html.encode('utf-8'))
|
||||||
|
|
||||||
images = []
|
# Images.
|
||||||
|
# Cache the image sizes in case they are used by a composite image.
|
||||||
image_sizes = {}
|
image_sizes = {}
|
||||||
if not os.path.exists(os.path.join(output_dir, 'images/')):
|
if not os.path.exists(os.path.join(output_dir, 'images/')):
|
||||||
os.makedirs(os.path.join(output_dir, 'images/'))
|
os.makedirs(os.path.join(output_dir, 'images/'))
|
||||||
@ -359,10 +406,10 @@ class Reader(FormatReader):
|
|||||||
self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
|
self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.log.error('Failed to write image with uid %s: %s' % (uid, e))
|
self.log.error('Failed to write image with uid %s: %s' % (uid, e))
|
||||||
images.append('%s.jpg' % uid)
|
|
||||||
else:
|
else:
|
||||||
self.log.error('Failed to write image with uid %s: No data.' % uid)
|
self.log.error('Failed to write image with uid %s: No data.' % uid)
|
||||||
# Composite images.
|
# Composite images.
|
||||||
|
# We're going to use the already compressed .jpg images here.
|
||||||
for uid, num in self.uid_composite_image_section_number.items():
|
for uid, num in self.uid_composite_image_section_number.items():
|
||||||
try:
|
try:
|
||||||
section_header, section_data = self.sections[num]
|
section_header, section_data = self.sections[num]
|
||||||
@ -559,7 +606,10 @@ class Reader(FormatReader):
|
|||||||
# 4 Bytes
|
# 4 Bytes
|
||||||
# alternate image record ID, image record ID
|
# alternate image record ID, image record ID
|
||||||
elif c == 0x5c:
|
elif c == 0x5c:
|
||||||
offset += 4
|
offset += 3
|
||||||
|
uid = struct.unpack('>H', d[offset:offset+2])[0]
|
||||||
|
html += '<img src="images/%s.jpg" />' % uid
|
||||||
|
offset += 1
|
||||||
# Underline text begins
|
# Underline text begins
|
||||||
# 0 Bytes
|
# 0 Bytes
|
||||||
elif c == 0x60:
|
elif c == 0x60:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user