From 08f5775f6596f94d02c470447963e3ed320daa15 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 10:36:35 -0600 Subject: [PATCH 1/3] ebook-convert: Abort if a keyboard interrupt is raised during parsing --- src/calibre/ebooks/oeb/reader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 422252f73e..5bb6b193f7 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -163,6 +163,8 @@ class OEBReader(object): if item.media_type in check: try: item.data + except KeyboardInterrupt: + raise except: self.logger.exception('Failed to parse content in %s'% item.href) From 59d9e1558004c53be8fc31b2f1838b1389587d91 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 10:53:31 -0600 Subject: [PATCH 2/3] Conversion pipeline: Strip out large blocks of contiguous space (more than 10000 contiguous blanks) as these slow down the conversion process and are almost always indicative of an error in the input document. --- src/calibre/ebooks/conversion/preprocess.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 885d0621e0..751d4f8cd6 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -303,6 +303,9 @@ class CSSPreProcessor(object): class HTMLPreProcessor(object): PREPROCESS = [ + # Remove huge block of contiguous spaces as they slow down + # the following regexes pretty badly + (re.compile(r'\s{10000,}'), lambda m: ''), # Some idiotic HTML generators (Frontpage I'm looking at you) # Put all sorts of crap into <head>. This messes up lxml (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL), From dbefbfbd862b9bcb5d233f428c8451a5c5048a54 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jul 2011 14:06:09 -0600 Subject: [PATCH 3/3] ... 
--- src/calibre/ebooks/mobi/writer.py | 632 +++++++++++++++--------------- 1 file changed, 321 insertions(+), 311 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index bd61ea559d..bf71ad55c2 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -1642,6 +1642,61 @@ class MobiWriter(object): for record in self._records: self._write(record) + def _clean_text_value(self, text): + if text is not None and text.strip() : + text = text.strip() + if not isinstance(text, unicode): + text = text.decode('utf-8', 'replace') + text = normalize(text).encode('utf-8') + else : + text = "(none)".encode('utf-8') + return text + + def _compute_offset_length(self, i, node, entries) : + h = node.href + if h not in self._id_offsets: + self._oeb.log.warning('Could not find TOC entry:', node.title) + return -1, -1 + + offset = self._id_offsets[h] + length = None + # Calculate length based on next entry's offset + for sibling in entries[i+1:]: + h2 = sibling.href + if h2 in self._id_offsets: + offset2 = self._id_offsets[h2] + if offset2 > offset: + length = offset2 - offset + break + if length is None: + length = self._content_length - offset + return offset, length + + def _establish_document_structure(self) : + documentType = None + try : + klass = self._ctoc_map[0]['klass'] + except : + klass = None + + if klass == 'chapter' or klass == None : + documentType = 'book' + if self.opts.verbose > 2 : + self._oeb.logger.info("Adding a MobiBook to self._MobiDoc") + self._MobiDoc.documentStructure = MobiBook() + + elif klass == 'periodical' : + documentType = klass + if self.opts.verbose > 2 : + self._oeb.logger.info("Adding a MobiPeriodical to self._MobiDoc") + self._MobiDoc.documentStructure = MobiPeriodical(self._MobiDoc.getNextNode()) + self._MobiDoc.documentStructure.startAddress = self._anchor_offset_kindle + else : + raise NotImplementedError('_establish_document_structure: unrecognized klass: %s' % klass) 
+ return documentType + + # Index {{{ + def _generate_index(self): self._oeb.log('Generating INDX ...') self._primary_index_record = None @@ -1815,276 +1870,7 @@ class MobiWriter(object): open(os.path.join(t, n+'.bin'), 'wb').write(self._records[-(i+1)]) self._oeb.log.debug('Index records dumped to', t) - def _clean_text_value(self, text): - if text is not None and text.strip() : - text = text.strip() - if not isinstance(text, unicode): - text = text.decode('utf-8', 'replace') - text = normalize(text).encode('utf-8') - else : - text = "(none)".encode('utf-8') - return text - - def _add_to_ctoc(self, ctoc_str, record_offset): - # Write vwilen + string to ctoc - # Return offset - # Is there enough room for this string in the current ctoc record? - if 0xfbf8 - self._ctoc.tell() < 2 + len(ctoc_str): - # flush this ctoc, start a new one - # print "closing ctoc_record at 0x%X" % self._ctoc.tell() - # print "starting new ctoc with '%-50.50s ...'" % ctoc_str - # pad with 00 - pad = 0xfbf8 - self._ctoc.tell() - # print "padding %d bytes of 00" % pad - self._ctoc.write('\0' * (pad)) - self._ctoc_records.append(self._ctoc.getvalue()) - self._ctoc.truncate(0) - self._ctoc_offset += 0x10000 - record_offset = self._ctoc_offset - - offset = self._ctoc.tell() + record_offset - self._ctoc.write(decint(len(ctoc_str), DECINT_FORWARD) + ctoc_str) - return offset - - def _add_flat_ctoc_node(self, node, ctoc, title=None): - # Process 'chapter' or 'article' nodes only, force either to 'chapter' - t = node.title if title is None else title - t = self._clean_text_value(t) - self._last_toc_entry = t - - # Create an empty dictionary for this node - ctoc_name_map = {} - - # article = chapter - if node.klass == 'article' : - ctoc_name_map['klass'] = 'chapter' - else : - ctoc_name_map['klass'] = node.klass - - # Add title offset to name map - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - self._chapterCount += 1 - - # append this node's name_map to map - 
self._ctoc_map.append(ctoc_name_map) - - return - - def _add_structured_ctoc_node(self, node, ctoc, title=None): - # Process 'periodical', 'section' and 'article' - - # Fetch the offset referencing the current ctoc_record - if node.klass is None : - return - t = node.title if title is None else title - t = self._clean_text_value(t) - self._last_toc_entry = t - - # Create an empty dictionary for this node - ctoc_name_map = {} - - # Add the klass of this node - ctoc_name_map['klass'] = node.klass - - if node.klass == 'chapter': - # Add title offset to name map - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - self._chapterCount += 1 - - elif node.klass == 'periodical' : - # Add title offset - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - - # Look for existing class entry 'periodical' in _ctoc_map - for entry in self._ctoc_map: - if entry['klass'] == 'periodical': - # Use the pre-existing instance - ctoc_name_map['classOffset'] = entry['classOffset'] - break - else : - continue - else: - # class names should always be in CNCX 0 - no offset - ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) - - self._periodicalCount += 1 - - elif node.klass == 'section' : - # Add title offset - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - - # Look for existing class entry 'section' in _ctoc_map - for entry in self._ctoc_map: - if entry['klass'] == 'section': - # Use the pre-existing instance - ctoc_name_map['classOffset'] = entry['classOffset'] - break - else : - continue - else: - # class names should always be in CNCX 0 - no offset - ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) - - self._sectionCount += 1 - - elif node.klass == 'article' : - # Add title offset/title - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - - # Look for existing class entry 'article' in _ctoc_map - for entry in self._ctoc_map: - if entry['klass'] == 'article': - 
ctoc_name_map['classOffset'] = entry['classOffset'] - break - else : - continue - else: - # class names should always be in CNCX 0 - no offset - ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) - - # Add description offset/description - if node.description : - d = self._clean_text_value(node.description) - ctoc_name_map['descriptionOffset'] = self._add_to_ctoc(d, self._ctoc_offset) - else : - ctoc_name_map['descriptionOffset'] = None - - # Add author offset/attribution - if node.author : - a = self._clean_text_value(node.author) - ctoc_name_map['authorOffset'] = self._add_to_ctoc(a, self._ctoc_offset) - else : - ctoc_name_map['authorOffset'] = None - - self._articleCount += 1 - - else : - raise NotImplementedError( \ - 'writer._generate_ctoc.add_node: title: %s has unrecognized klass: %s, playOrder: %d' % \ - (node.title, node.klass, node.play_order)) - - # append this node's name_map to map - self._ctoc_map.append(ctoc_name_map) - - def _generate_ctoc(self): - # Generate the compiled TOC strings - # Each node has 1-4 CTOC entries: - # Periodical (0xDF) - # title, class - # Section (0xFF) - # title, class - # Article (0x3F) - # title, class, description, author - # Chapter (0x0F) - # title, class - # nb: Chapters don't actually have @class, so we synthesize it - # in reader._toc_from_navpoint - - toc = self._oeb.toc - reduced_toc = [] - self._ctoc_map = [] # per node dictionary of {class/title/desc/author} offsets - self._last_toc_entry = None - #ctoc = StringIO() - self._ctoc = StringIO() - - # Track the individual node types - self._periodicalCount = 0 - self._sectionCount = 0 - self._articleCount = 0 - self._chapterCount = 0 - - #first = True - - if self._conforming_periodical_toc : - self._oeb.logger.info('Generating structured CTOC ...') - for (child) in toc.iter(): - if self.opts.verbose > 2 : - self._oeb.logger.info(" %s" % child) - self._add_structured_ctoc_node(child, self._ctoc) - #first = False - - else : - 
self._oeb.logger.info('Generating flat CTOC ...') - previousOffset = -1 - currentOffset = 0 - for (i, child) in enumerate(toc.iterdescendants()): - # Only add chapters or articles at depth==1 - # no class defaults to 'chapter' - if child.klass is None : child.klass = 'chapter' - if (child.klass == 'article' or child.klass == 'chapter') and child.depth() == 1 : - if self.opts.verbose > 2 : - self._oeb.logger.info("adding (klass:%s depth:%d) %s to flat ctoc" % \ - (child.klass, child.depth(), child) ) - - # Test to see if this child's offset is the same as the previous child's - # offset, skip it - h = child.href - - if h is None: - self._oeb.logger.warn(' Ignoring TOC entry with no href:', - child.title) - continue - if h not in self._id_offsets: - self._oeb.logger.warn(' Ignoring missing TOC entry:', - unicode(child)) - continue - - currentOffset = self._id_offsets[h] - # print "_generate_ctoc: child offset: 0x%X" % currentOffset - - if currentOffset != previousOffset : - self._add_flat_ctoc_node(child, self._ctoc) - reduced_toc.append(child) - previousOffset = currentOffset - else : - self._oeb.logger.warn(" Ignoring redundant href: %s in '%s'" % (h, child.title)) - - else : - if self.opts.verbose > 2 : - self._oeb.logger.info("skipping class: %s depth %d at position %d" % \ - (child.klass, child.depth(),i)) - - # Update the TOC with our edited version - self._oeb.toc.nodes = reduced_toc - - # Instantiate a MobiDocument(mobitype) - if (not self._periodicalCount and not self._sectionCount and not self._articleCount) or \ - not self.opts.mobi_periodical : - mobiType = 0x002 - elif self._periodicalCount: - pt = None - if self._oeb.metadata.publication_type: - x = unicode(self._oeb.metadata.publication_type[0]).split(':') - if len(x) > 1: - pt = x[1] - mobiType = {'newspaper':0x101}.get(pt, 0x103) - else : - raise NotImplementedError('_generate_ctoc: Unrecognized document structured') - - self._MobiDoc = MobiDocument(mobiType) - - if self.opts.verbose > 2 : - 
structType = 'book' - if mobiType > 0x100 : - structType = 'flat periodical' if mobiType == 0x102 else 'structured periodical' - self._oeb.logger.info("Instantiating a %s MobiDocument of type 0x%X" % (structType, mobiType ) ) - if mobiType > 0x100 : - self._oeb.logger.info("periodicalCount: %d sectionCount: %d articleCount: %d"% \ - (self._periodicalCount, self._sectionCount, self._articleCount) ) - else : - self._oeb.logger.info("chapterCount: %d" % self._chapterCount) - - # Apparently the CTOC must end with a null byte - self._ctoc.write('\0') - - ctoc = self._ctoc.getvalue() - rec_count = len(self._ctoc_records) - self._oeb.logger.info(" CNCX utilization: %d %s %.0f%% full" % \ - (rec_count + 1, 'records, last record' if rec_count else 'record,', - len(ctoc)/655) ) - - return align_block(ctoc) - + # Index nodes {{{ def _write_periodical_node(self, indxt, indices, index, offset, length, count, firstSection, lastSection) : pos = 0xc0 + indxt.tell() indices.write(pack('>H', pos)) # Save the offset for IDXTIndices @@ -2176,48 +1962,8 @@ class MobiWriter(object): indxt.write(decint(self._ctoc_map[index]['titleOffset'], DECINT_FORWARD)) # vwi title offset in CNCX indxt.write(decint(0, DECINT_FORWARD)) # unknown byte - def _compute_offset_length(self, i, node, entries) : - h = node.href - if h not in self._id_offsets: - self._oeb.log.warning('Could not find TOC entry:', node.title) - return -1, -1 + # }}} - offset = self._id_offsets[h] - length = None - # Calculate length based on next entry's offset - for sibling in entries[i+1:]: - h2 = sibling.href - if h2 in self._id_offsets: - offset2 = self._id_offsets[h2] - if offset2 > offset: - length = offset2 - offset - break - if length is None: - length = self._content_length - offset - return offset, length - - def _establish_document_structure(self) : - documentType = None - try : - klass = self._ctoc_map[0]['klass'] - except : - klass = None - - if klass == 'chapter' or klass == None : - documentType = 'book' - if 
self.opts.verbose > 2 : - self._oeb.logger.info("Adding a MobiBook to self._MobiDoc") - self._MobiDoc.documentStructure = MobiBook() - - elif klass == 'periodical' : - documentType = klass - if self.opts.verbose > 2 : - self._oeb.logger.info("Adding a MobiPeriodical to self._MobiDoc") - self._MobiDoc.documentStructure = MobiPeriodical(self._MobiDoc.getNextNode()) - self._MobiDoc.documentStructure.startAddress = self._anchor_offset_kindle - else : - raise NotImplementedError('_establish_document_structure: unrecognized klass: %s' % klass) - return documentType def _generate_section_indices(self, child, currentSection, myPeriodical, myDoc ) : sectionTitles = list(child.iter())[1:] @@ -2495,6 +2241,270 @@ class MobiWriter(object): last_name, c = self._add_periodical_structured_articles(myDoc, indxt, indices) return align_block(indxt.getvalue()), c, align_block(indices.getvalue()), last_name + # }}} + + # CTOC {{{ + def _add_to_ctoc(self, ctoc_str, record_offset): + # Write vwilen + string to ctoc + # Return offset + # Is there enough room for this string in the current ctoc record? 
+ if 0xfbf8 - self._ctoc.tell() < 2 + len(ctoc_str): + # flush this ctoc, start a new one + # print "closing ctoc_record at 0x%X" % self._ctoc.tell() + # print "starting new ctoc with '%-50.50s ...'" % ctoc_str + # pad with 00 + pad = 0xfbf8 - self._ctoc.tell() + # print "padding %d bytes of 00" % pad + self._ctoc.write('\0' * (pad)) + self._ctoc_records.append(self._ctoc.getvalue()) + self._ctoc.truncate(0) + self._ctoc_offset += 0x10000 + record_offset = self._ctoc_offset + + offset = self._ctoc.tell() + record_offset + self._ctoc.write(decint(len(ctoc_str), DECINT_FORWARD) + ctoc_str) + return offset + + def _add_flat_ctoc_node(self, node, ctoc, title=None): + # Process 'chapter' or 'article' nodes only, force either to 'chapter' + t = node.title if title is None else title + t = self._clean_text_value(t) + self._last_toc_entry = t + + # Create an empty dictionary for this node + ctoc_name_map = {} + + # article = chapter + if node.klass == 'article' : + ctoc_name_map['klass'] = 'chapter' + else : + ctoc_name_map['klass'] = node.klass + + # Add title offset to name map + ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) + self._chapterCount += 1 + + # append this node's name_map to map + self._ctoc_map.append(ctoc_name_map) + + return + + def _add_structured_ctoc_node(self, node, ctoc, title=None): + # Process 'periodical', 'section' and 'article' + + # Fetch the offset referencing the current ctoc_record + if node.klass is None : + return + t = node.title if title is None else title + t = self._clean_text_value(t) + self._last_toc_entry = t + + # Create an empty dictionary for this node + ctoc_name_map = {} + + # Add the klass of this node + ctoc_name_map['klass'] = node.klass + + if node.klass == 'chapter': + # Add title offset to name map + ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) + self._chapterCount += 1 + + elif node.klass == 'periodical' : + # Add title offset + ctoc_name_map['titleOffset'] = 
self._add_to_ctoc(t, self._ctoc_offset) + + # Look for existing class entry 'periodical' in _ctoc_map + for entry in self._ctoc_map: + if entry['klass'] == 'periodical': + # Use the pre-existing instance + ctoc_name_map['classOffset'] = entry['classOffset'] + break + else : + continue + else: + # class names should always be in CNCX 0 - no offset + ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) + + self._periodicalCount += 1 + + elif node.klass == 'section' : + # Add title offset + ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) + + # Look for existing class entry 'section' in _ctoc_map + for entry in self._ctoc_map: + if entry['klass'] == 'section': + # Use the pre-existing instance + ctoc_name_map['classOffset'] = entry['classOffset'] + break + else : + continue + else: + # class names should always be in CNCX 0 - no offset + ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) + + self._sectionCount += 1 + + elif node.klass == 'article' : + # Add title offset/title + ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) + + # Look for existing class entry 'article' in _ctoc_map + for entry in self._ctoc_map: + if entry['klass'] == 'article': + ctoc_name_map['classOffset'] = entry['classOffset'] + break + else : + continue + else: + # class names should always be in CNCX 0 - no offset + ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) + + # Add description offset/description + if node.description : + d = self._clean_text_value(node.description) + ctoc_name_map['descriptionOffset'] = self._add_to_ctoc(d, self._ctoc_offset) + else : + ctoc_name_map['descriptionOffset'] = None + + # Add author offset/attribution + if node.author : + a = self._clean_text_value(node.author) + ctoc_name_map['authorOffset'] = self._add_to_ctoc(a, self._ctoc_offset) + else : + ctoc_name_map['authorOffset'] = None + + self._articleCount += 1 + + else : + raise NotImplementedError( \ + 
'writer._generate_ctoc.add_node: title: %s has unrecognized klass: %s, playOrder: %d' % \ + (node.title, node.klass, node.play_order)) + + # append this node's name_map to map + self._ctoc_map.append(ctoc_name_map) + + def _generate_ctoc(self): + # Generate the compiled TOC strings + # Each node has 1-4 CTOC entries: + # Periodical (0xDF) + # title, class + # Section (0xFF) + # title, class + # Article (0x3F) + # title, class, description, author + # Chapter (0x0F) + # title, class + # nb: Chapters don't actually have @class, so we synthesize it + # in reader._toc_from_navpoint + + toc = self._oeb.toc + reduced_toc = [] + self._ctoc_map = [] # per node dictionary of {class/title/desc/author} offsets + self._last_toc_entry = None + #ctoc = StringIO() + self._ctoc = StringIO() + + # Track the individual node types + self._periodicalCount = 0 + self._sectionCount = 0 + self._articleCount = 0 + self._chapterCount = 0 + + #first = True + + if self._conforming_periodical_toc : + self._oeb.logger.info('Generating structured CTOC ...') + for (child) in toc.iter(): + if self.opts.verbose > 2 : + self._oeb.logger.info(" %s" % child) + self._add_structured_ctoc_node(child, self._ctoc) + #first = False + + else : + self._oeb.logger.info('Generating flat CTOC ...') + previousOffset = -1 + currentOffset = 0 + for (i, child) in enumerate(toc.iterdescendants()): + # Only add chapters or articles at depth==1 + # no class defaults to 'chapter' + if child.klass is None : child.klass = 'chapter' + if (child.klass == 'article' or child.klass == 'chapter') and child.depth() == 1 : + if self.opts.verbose > 2 : + self._oeb.logger.info("adding (klass:%s depth:%d) %s to flat ctoc" % \ + (child.klass, child.depth(), child) ) + + # Test to see if this child's offset is the same as the previous child's + # offset, skip it + h = child.href + + if h is None: + self._oeb.logger.warn(' Ignoring TOC entry with no href:', + child.title) + continue + if h not in self._id_offsets: + 
self._oeb.logger.warn(' Ignoring missing TOC entry:', + unicode(child)) + continue + + currentOffset = self._id_offsets[h] + # print "_generate_ctoc: child offset: 0x%X" % currentOffset + + if currentOffset != previousOffset : + self._add_flat_ctoc_node(child, self._ctoc) + reduced_toc.append(child) + previousOffset = currentOffset + else : + self._oeb.logger.warn(" Ignoring redundant href: %s in '%s'" % (h, child.title)) + + else : + if self.opts.verbose > 2 : + self._oeb.logger.info("skipping class: %s depth %d at position %d" % \ + (child.klass, child.depth(),i)) + + # Update the TOC with our edited version + self._oeb.toc.nodes = reduced_toc + + # Instantiate a MobiDocument(mobitype) + if (not self._periodicalCount and not self._sectionCount and not self._articleCount) or \ + not self.opts.mobi_periodical : + mobiType = 0x002 + elif self._periodicalCount: + pt = None + if self._oeb.metadata.publication_type: + x = unicode(self._oeb.metadata.publication_type[0]).split(':') + if len(x) > 1: + pt = x[1] + mobiType = {'newspaper':0x101}.get(pt, 0x103) + else : + raise NotImplementedError('_generate_ctoc: Unrecognized document structured') + + self._MobiDoc = MobiDocument(mobiType) + + if self.opts.verbose > 2 : + structType = 'book' + if mobiType > 0x100 : + structType = 'flat periodical' if mobiType == 0x102 else 'structured periodical' + self._oeb.logger.info("Instantiating a %s MobiDocument of type 0x%X" % (structType, mobiType ) ) + if mobiType > 0x100 : + self._oeb.logger.info("periodicalCount: %d sectionCount: %d articleCount: %d"% \ + (self._periodicalCount, self._sectionCount, self._articleCount) ) + else : + self._oeb.logger.info("chapterCount: %d" % self._chapterCount) + + # Apparently the CTOC must end with a null byte + self._ctoc.write('\0') + + ctoc = self._ctoc.getvalue() + rec_count = len(self._ctoc_records) + self._oeb.logger.info(" CNCX utilization: %d %s %.0f%% full" % \ + (rec_count + 1, 'records, last record' if rec_count else 'record,', + 
len(ctoc)/655) ) + + return align_block(ctoc) + + # }}} class HTMLRecordData(object): """ A data structure containing indexing/navigation data for an HTML record """