From 8557981a51d551907154684b7b16f4d89c56247b Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 16 Apr 2011 14:43:36 -0400 Subject: [PATCH] Don't put every PHTML record into one ordered html file. Plucker documents are groups of separate PHTML pages that are linked via hyperlinks. --- src/calibre/ebooks/pdb/plucker/reader.py | 78 ++++++++++++------------ 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/src/calibre/ebooks/pdb/plucker/reader.py b/src/calibre/ebooks/pdb/plucker/reader.py index 13dea343a7..171c051bbd 100644 --- a/src/calibre/ebooks/pdb/plucker/reader.py +++ b/src/calibre/ebooks/pdb/plucker/reader.py @@ -145,6 +145,7 @@ class HeaderRecord(object): # 1 is DOC compressed self.compression, = struct.unpack('>H', raw[2:4]) self.records, = struct.unpack('>H', raw[4:6]) + self.home_html = None self.reserved = {} for i in xrange(self.records): @@ -152,6 +153,8 @@ class HeaderRecord(object): name, = struct.unpack('>H', raw[6+adv:8+adv]) id, = struct.unpack('>H', raw[8+adv:10+adv]) self.reserved[id] = name + if name == 0: + self.home_html = id class SectionHeader(object): @@ -279,24 +282,21 @@ class Reader(FormatReader): self.mi = get_metadata(stream, False) def extract_content(self, output_dir): - html = u'' - images = [] - - for uid, num in self.uid_text_secion_number.items(): - section_header, section_data = self.sections[num] - if section_header.type == DATATYPE_PHTML: - html += self.process_phtml(section_data.header, section_data.data) - elif section_header.type == DATATYPE_PHTML_COMPRESSED: - d = self.decompress_phtml(section_data.data) - html += self.process_phtml(section_header.uid, section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace') - - html += '' - with CurrentDir(output_dir): - with open('index.html', 'wb') as index: - self.log.debug('Writing text to index.html') - index.write(html.encode('utf-8')) + for uid, num in self.uid_text_secion_number.items(): + self.log.debug(_('Writing record with uid: %s as %s.html' % (uid, uid))) + with open('%s.html' % uid, 'wb') as htmlf: + html = u'' + section_header, section_data = self.sections[num] + if section_header.type == DATATYPE_PHTML: + html += self.process_phtml(section_data.header, section_data.data) + elif section_header.type == DATATYPE_PHTML_COMPRESSED: + d = self.decompress_phtml(section_data.data) + html += self.process_phtml(section_data.header, d).decode(self.get_text_uid_encoding(section_header.uid), 'replace') + html += '' + htmlf.write(html.encode('utf-8')) + images = [] if not os.path.exists(os.path.join(output_dir, 'images/')): os.makedirs(os.path.join(output_dir, 'images/')) with CurrentDir(os.path.join(output_dir, 'images/')): @@ -326,9 +326,25 @@ class Reader(FormatReader): else: self.log.error('Failed to write image with uid %s: No data.' % uid) - opf_path = self.create_opf(output_dir, images) + # Run the HTML through the html processing plugin. + from calibre.customize.ui import plugin_for_input_format + html_input = plugin_for_input_format('html') + for opt in html_input.options: + setattr(self.options, opt.option.name, opt.recommended_value) + self.options.input_encoding = 'utf-8' + odi = self.options.debug_pipeline + self.options.debug_pipeline = None + # Generate oeb from html conversion. + try: + home_html = self.header_record.home_html + if not home_html: + home_html = self.uid_text_secion_number.items()[0][0] + except: + raise Exception(_('Could not determine home.html')) + oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {}) + self.options.debug_pipeline = odi - return opf_path + return oeb def decompress_phtml(self, data): if self.header_record.compression == 2: @@ -339,8 +355,8 @@ class Reader(FormatReader): #from calibre.ebooks.compression.palmdoc import decompress_doc return decompress_doc(data) - def process_phtml(self, uid, sub_header, d): - html = u'

' % (uid, uid) + def process_phtml(self, sub_header, d): + html = u'

' offset = 0 paragraph_open = True need_set_p_id = False @@ -354,7 +370,7 @@ class Reader(FormatReader): while offset < len(d): if not paragraph_open: if need_set_p_id: - html += u'

' % (uid, p_num) + html += u'

' % p_num p_num += 1 need_set_p_id = False else: @@ -371,7 +387,7 @@ class Reader(FormatReader): if c == 0x0a: offset += 1 id = struct.unpack('>H', d[offset:offset+2])[0] - html += '' % id + html += '' % id offset += 1 # Targeted page link begins # 3 Bytes @@ -387,7 +403,7 @@ class Reader(FormatReader): id = struct.unpack('>H', d[offset:offset+2])[0] offset += 2 pid = struct.unpack('>H', d[offset:offset+2])[0] - html += '' % (id, pid) + html += '' % (id, pid) offset += 1 # Targeted paragraph link begins # 5 Bytes @@ -543,19 +559,3 @@ class Reader(FormatReader): def get_text_uid_encoding(self, uid): return self.uid_text_secion_encoding.get(uid, self.default_encoding) - - def create_opf(self, output_dir, images): - with CurrentDir(output_dir): - opf = OPFCreator(output_dir, self.mi) - - manifest = [('index.html', None)] - - for i in images: - manifest.append((os.path.join('images/', i), None)) - - opf.create_manifest(manifest) - opf.create_spine(['index.html']) - with open('metadata.opf', 'wb') as opffile: - opf.render(opffile) - - return os.path.join(output_dir, 'metadata.opf')