From e0a0c08839ebe3b72b20451b76ec0fdcb686e884 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 12 Feb 2011 14:07:32 -0500 Subject: [PATCH] Kindle Interface: APNX add support for accurate generation. Add comments and descriptions to APNX file. Add interface options for controlling how and if the APNX file is generated. --- src/calibre/devices/kindle/apnx.py | 168 ++++++++++++++++++++++----- src/calibre/devices/kindle/driver.py | 29 ++++- 2 files changed, 164 insertions(+), 33 deletions(-) diff --git a/src/calibre/devices/kindle/apnx.py b/src/calibre/devices/kindle/apnx.py index d8dc9709d9..721b86f36f 100644 --- a/src/calibre/devices/kindle/apnx.py +++ b/src/calibre/devices/kindle/apnx.py @@ -11,44 +11,43 @@ Generates and writes an APNX page mapping file. import struct import uuid +from calibre.ebooks import DRMError +from calibre.ebooks.mobi.reader import MobiReader from calibre.ebooks.pdb.header import PdbHeaderReader +from calibre.utils.logging import default_log class APNXBuilder(object): ''' - 2300 characters of uncompressed text per page. This is - not meant to map 1 to 1 to a print book but to be a - close enough measure. - - A test book was chosen and the characters were counted - on one page. This number was round to 2240 then 60 - characters of markup were added to the total giving - 2300. - - Uncompressed text length is used because it's easily - accessible in MOBI files (part of the header). Also, - It's faster to work off of the length then to - decompress and parse the actual text. - - A better but much more resource intensive and slower - method to calculate the page length would be to parse - the uncompressed text. For each paragraph we would - want to find how many lines it would occupy in a paper - back book. 70 characters per line and 32 lines per page. - So divide the number of characters (minus markup) in - each paragraph by 70. If there are less than 70 - characters in the paragraph then it is 1 line. 
Then, - count every 32 lines and mark that location as a page. + Create an APNX file using a pseudo page mapping. ''' - def write_apnx(self, mobi_file_path, apnx_path): + def write_apnx(self, mobi_file_path, apnx_path, accurate=True): + # Check that this is really a MOBI file. with open(mobi_file_path, 'rb') as mf: - phead = PdbHeaderReader(mf) - r0 = phead.section_data(0) - text_length = struct.unpack('>I', r0[4:8])[0] + ident = PdbHeaderReader(mf).identity() + if ident != 'BOOKMOBI': + raise Exception(_('Not a valid MOBI file. Reports identity of %s' % ident)) - pages = self.get_pages(text_length) + # Get the pages depending on the chosen parser + pages = [] + if accurate: + try: + pages = self.get_pages_accurate(mobi_file_path) + except: + # Fall back to the fast parser if we can't + # use the accurate one. Typically this is + # due to the file having DRM. + pages = self.get_pages_fast(mobi_file_path) + else: + pages = self.get_pages_fast(mobi_file_path) + + if not pages: + raise Exception(_('Could not generate page mapping.')) + + # Generate the APNX file from the page mapping. apnx = self.generate_apnx(pages) + # Write the APNX. with open(apnx_path, 'wb') as apnxf: apnxf.write(apnx) @@ -73,18 +72,125 @@ class APNXBuilder(object): apnx += struct.pack('>H', 32) apnx += page_header - # write page values to apnx + # Write page values to APNX. for page in pages: - apnx += struct.pack('>L', page) + apnx += struct.pack('>I', page) return apnx - def get_pages(self, text_length): + def get_pages_fast(self, mobi_file_path): + ''' + 2300 characters of uncompressed text per page. This is + not meant to map 1 to 1 to a print book but to be a + close enough measure. + + A test book was chosen and the characters were counted + on one page. This number was round to 2240 then 60 + characters of markup were added to the total giving + 2300. + + Uncompressed text length is used because it's easily + accessible in MOBI files (part of the header). 
Also, + It's faster to work off of the length then to + decompress and parse the actual text. + ''' + text_length = 0 pages = [] count = 0 + + with open(mobi_file_path, 'rb') as mf: + phead = PdbHeaderReader(mf) + r0 = phead.section_data(0) + text_length = struct.unpack('>I', r0[4:8])[0] while count < text_length: pages.append(count) count += 2300 return pages + + def get_pages_accurate(self, mobi_file_path): + ''' + A more accurate but much more resource intensive and slower + method to calculate the page length. + + Parses the uncompressed text. In an average paper back book + There are 32 lines per page and a maximum of 70 characters + per line. + + Each paragraph starts a new line and every 70 characters + (minus markup) in a paragraph starts a new line. The + position after every 30 lines will be marked as a new + page. + + This can be make more accurate by accounting for +
<div class="mbp_pagebreak" /> as a new page marker. + And <br>
elements as an empty line. + ''' + pages = [] + + # Get the MOBI html. + mr = MobiReader(mobi_file_path, default_log) + if mr.book_header.encryption_type != 0: + raise DRMError() + mr.extract_text() + + # States + in_tag = False + in_p = False + check_p = False + closing = False + p_char_count = 0 + + # Get positions of every line + # A line is either a paragraph starting + # or every 70 characters in a paragraph. + lines = [] + pos = -1 + # We want this to be as fast as possible so we + # are going to do one pass across the text. re + # and string functions will parse the text each + # time they are called. + # + # We can can use .lower() here because we are + # not modifying the text. In this case the case + # doesn't matter just the absolute character and + # the position within the stream. + for c in mr.mobi_html.lower(): + pos += 1 + + # Check if we are starting or stopping a p tag. + if check_p: + if c == '/': + closing = True + continue + elif c == 'p': + if closing: + in_p = False + else: + in_p = True + lines.append(pos - 2) + check_p = False + closing = False + continue + + if c == '<': + in_tag = True + check_p = True + continue + elif c == '>': + in_tag = False + check_p = False + continue + + if in_p and not in_tag: + p_char_count += 1 + if p_char_count == 70: + lines.append(pos) + p_char_count = 0 + + # Every 30 lines is a new page + for i in xrange(0, len(lines), 32): + pages.append(lines[i]) + + return pages diff --git a/src/calibre/devices/kindle/driver.py b/src/calibre/devices/kindle/driver.py index 5c150eab5a..3f9aa26184 100644 --- a/src/calibre/devices/kindle/driver.py +++ b/src/calibre/devices/kindle/driver.py @@ -175,6 +175,27 @@ class KINDLE2(KINDLE): PRODUCT_ID = [0x0002, 0x0004] BCD = [0x0100] + + EXTRA_CUSTOMIZATION_MESSAGE = [ + _('Write page mapping (APNX) file when sending books') + + ':::' + + _('The APNX page mapping file is a new feature in the Kindle 3\'s ' + '3.1 firmware. 
It allows for page numbers to that correspond to pages ' + 'in a print book. This will write an APNX file that uses pseudo page ' + 'numbers based on the the average page length in a paper back book.'), + _('Use slower but more accurate APNX generation') + + ':::' + + _('There are two ways to generate the APNX file. Using the more accurate ' + 'generator will produce pages that correspond better to a printed book. ' + 'However, this method is slower and more intensive. Unchecking this ' + 'option will default to using the faster but less accurate generator.'), + ] + EXTRA_CUSTOMIZATION_DEFAULT = [ + True, + True, + ] + OPT_APNX = 0 + OPT_APNX_ACCURATE = 1 def books(self, oncard=None, end_session=True): bl = USBMS.books(self, oncard=oncard, end_session=end_session) @@ -212,13 +233,17 @@ class KINDLE2(KINDLE): ''' Hijacking this function to write the apnx file. ''' - if not filepath.lower().endswith('.mobi'): + opts = self.settings() + if not opts.extra_customization[self.OPT_APNX]: + return + + if os.path.splitext(filepath.lower())[1] not in ('.azw', '.mobi', '.prc'): return apnx_path = '%s.apnx' % os.path.join(path, filename) apnx_builder = APNXBuilder() try: - apnx_builder.write_apnx(filepath, apnx_path) + apnx_builder.write_apnx(filepath, apnx_path, accurate=opts.extra_customization[self.OPT_APNX_ACCURATE]) except: print 'Failed to generate APNX' import traceback