Fix decoding text. Add internal link support.

This commit is contained in:
John Schember 2011-04-16 14:13:45 -04:00
parent 0f3228e658
commit acaa06de53

View File

@@ -263,7 +263,7 @@ class Reader(FormatReader):
elif section_header.type == DATATYPE_METADATA: elif section_header.type == DATATYPE_METADATA:
self.metadata_section_number = section_number self.metadata_section_number = section_number
section = SectionMetadata(raw_data[start:]) section = SectionMetadata(raw_data[start:])
elif section_header.type == DATATYPE_COMPOSITE_IMAGE: #elif section_header.type == DATATYPE_COMPOSITE_IMAGE:
self.sections.append((section_header, section)) self.sections.append((section_header, section))
@@ -285,10 +285,10 @@ class Reader(FormatReader):
for uid, num in self.uid_text_secion_number.items(): for uid, num in self.uid_text_secion_number.items():
section_header, section_data = self.sections[num] section_header, section_data = self.sections[num]
if section_header.type == DATATYPE_PHTML: if section_header.type == DATATYPE_PHTML:
html += self.process_phtml(section_data.header, section_data.data.decode(self.get_text_uid_encoding(section_header.uid), 'replace')) html += self.process_phtml(section_data.header, section_data.data)
elif section_header.type == DATATYPE_PHTML_COMPRESSED: elif section_header.type == DATATYPE_PHTML_COMPRESSED:
d = self.decompress_phtml(section_data.data).decode(self.get_text_uid_encoding(section_header.uid), 'replace') d = self.decompress_phtml(section_data.data)
html += self.process_phtml(section_data.header, d) html += self.process_phtml(section_header.uid, section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
html += '</body></html>' html += '</body></html>'
@@ -300,7 +300,6 @@ class Reader(FormatReader):
if not os.path.exists(os.path.join(output_dir, 'images/')): if not os.path.exists(os.path.join(output_dir, 'images/')):
os.makedirs(os.path.join(output_dir, 'images/')) os.makedirs(os.path.join(output_dir, 'images/'))
with CurrentDir(os.path.join(output_dir, 'images/')): with CurrentDir(os.path.join(output_dir, 'images/')):
#im.read('/Users/john/Tmp/plkr/apnx.palm')
for uid, num in self.uid_image_section_number.items(): for uid, num in self.uid_image_section_number.items():
section_header, section_data = self.sections[num] section_header, section_data = self.sections[num]
if section_data: if section_data:
@@ -340,10 +339,12 @@ class Reader(FormatReader):
#from calibre.ebooks.compression.palmdoc import decompress_doc #from calibre.ebooks.compression.palmdoc import decompress_doc
return decompress_doc(data) return decompress_doc(data)
def process_phtml(self, sub_header, d): def process_phtml(self, uid, sub_header, d):
html = u'' html = u'<a id="p%s" /><p id="p%s-0">' % (uid, uid)
offset = 0 offset = 0
paragraph_open = False paragraph_open = True
need_set_p_id = False
p_num = 1
paragraph_offsets = [] paragraph_offsets = []
running_offset = 0 running_offset = 0
for size in sub_header.sizes: for size in sub_header.sizes:
@@ -352,7 +353,12 @@ class Reader(FormatReader):
while offset < len(d): while offset < len(d):
if not paragraph_open: if not paragraph_open:
html += u'<p>' if need_set_p_id:
html += u'<p id="p%s-%s">' % (uid, p_num)
p_num += 1
need_set_p_id = False
else:
html += u'<p>'
paragraph_open = True paragraph_open = True
c = ord(d[offset]) c = ord(d[offset])
@@ -363,26 +369,36 @@ class Reader(FormatReader):
# 2 Bytes # 2 Bytes
# record ID # record ID
if c == 0x0a: if c == 0x0a:
offset += 2 offset += 1
id = struct.unpack('>H', d[offset:offset+2])[0]
html += '<a href="#p%s">' % id
offset += 1
# Targeted page link begins # Targeted page link begins
# 3 Bytes # 3 Bytes
# record ID, target # record ID, target
elif c == 0x0b: elif c == 0x0b:
offset += 3 offset += 3
html += '<a>'
# Paragraph link begins # Paragraph link begins
# 4 Bytes # 4 Bytes
# record ID, paragraph number # record ID, paragraph number
elif c == 0x0c: elif c == 0x0c:
offset += 4 offset += 1
id = struct.unpack('>H', d[offset:offset+2])[0]
offset += 2
pid = struct.unpack('>H', d[offset:offset+2])[0]
html += '<a href="#p%s-%s">' % (id, pid)
offset += 1
# Targeted paragraph link begins # Targeted paragraph link begins
# 5 Bytes # 5 Bytes
# record ID, paragraph number, target # record ID, paragraph number, target
elif c == 0x0d: elif c == 0x0d:
offset += 5 offset += 5
html += '<a>'
# Link ends # Link ends
# 0 Bytes # 0 Bytes
elif c == 0x08: elif c == 0x08:
pass html += '</a>'
# Set font # Set font
# 1 Bytes # 1 Bytes
# font specifier # font specifier
@@ -515,10 +531,11 @@ class Reader(FormatReader):
html += unichr(c) html += unichr(c)
offset += 1 offset += 1
if offset in paragraph_offsets: if offset in paragraph_offsets:
need_set_p_id = True
if paragraph_open: if paragraph_open:
html += u'</p>\n' html += u'</p>\n'
paragraph_open = False paragraph_open = False
if paragraph_open: if paragraph_open:
html += u'</p>' html += u'</p>'