Kindle driver: Add an option to turn off sending page number information. Also add an option to use a more accurate but slower algorithm to calculate page numbers

2025-07-09 03:04:10 -04:00 · 2011-02-13 16:43:53 -07:00 · 2011-02-13 16:43:53 -07:00 · 6ee8ffb9ff
commit 6ee8ffb9ff
parent 7873df5073 e0a0c08839
2 changed files with 165 additions and 33 deletions
--- a/src/calibre/devices/kindle/apnx.py
+++ b/src/calibre/devices/kindle/apnx.py
@ -11,44 +11,42 @@ Generates and writes an APNX page mapping file.
 import struct
 import uuid
 from calibre.ebooks.mobi.reader import MobiReader
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.utils.logging import default_log
 class APNXBuilder(object):
    '''
-    2300 characters of uncompressed text per page. This is
+    Create an APNX file using a pseudo page mapping.
    not meant to map 1 to 1 to a print book but to be a
    close enough measure.
    A test book was chosen and the characters were counted
    on one page. This number was rounded to 2240, then 60
    characters of markup were added to the total giving
    2300.
    Uncompressed text length is used because it's easily
    accessible in MOBI files (part of the header). Also,
    it's faster to work off of the length than to
    decompress and parse the actual text.
    A better but much more resource intensive and slower
    method to calculate the page length would be to parse
    the uncompressed text. For each paragraph we would
    want to find how many lines it would occupy in a paper
    back book. 70 characters per line and 32 lines per page.
    So divide the number of characters (minus markup) in
    each paragraph by 70. If there are less than 70
    characters in the paragraph then it is 1 line. Then,
    count every 32 lines and mark that location as a page.
    '''
-    def write_apnx(self, mobi_file_path, apnx_path):
+    def write_apnx(self, mobi_file_path, apnx_path, accurate=True):
        # Check that this is really a MOBI file.
        with open(mobi_file_path, 'rb') as mf:
-            phead = PdbHeaderReader(mf)
+            ident = PdbHeaderReader(mf).identity()
-            r0 = phead.section_data(0)
+        if ident != 'BOOKMOBI':
-            text_length = struct.unpack('>I', r0[4:8])[0]
+            raise Exception(_('Not a valid MOBI file. Reports identity of %s' % ident))
-        pages = self.get_pages(text_length)
+        # Get the pages depending on the chosen parser
        pages = []
        if accurate:
            try:
                pages = self.get_pages_accurate(mobi_file_path)
            except:
                # Fall back to the fast parser if we can't
                # use the accurate one. Typically this is
                # due to the file having DRM.
                pages = self.get_pages_fast(mobi_file_path)
        else:
            pages = self.get_pages_fast(mobi_file_path)
        if not pages:
            raise Exception(_('Could not generate page mapping.'))
        # Generate the APNX file from the page mapping.
        apnx = self.generate_apnx(pages)
        # Write the APNX.
        with open(apnx_path, 'wb') as apnxf:
            apnxf.write(apnx)
@ -73,18 +71,126 @@ class APNXBuilder(object):
        apnx += struct.pack('>H', 32)
        apnx += page_header
-        # write page values to apnx
+        # Write page values to APNX.
        for page in pages:
-            apnx += struct.pack('>L', page)
+            apnx += struct.pack('>I', page)
        return apnx
-    def get_pages(self, text_length):
+    def get_pages_fast(self, mobi_file_path):
        '''
        2300 characters of uncompressed text per page. This is
        not meant to map 1 to 1 to a print book but to be a
        close enough measure.
        A test book was chosen and the characters were counted
        on one page. This number was rounded to 2240, then 60
        characters of markup were added to the total giving
        2300.
        Uncompressed text length is used because it's easily
        accessible in MOBI files (part of the header). Also,
        it's faster to work off of the length than to
        decompress and parse the actual text.
        '''
        text_length = 0
        pages = []
        count = 0
        with open(mobi_file_path, 'rb') as mf:
            phead = PdbHeaderReader(mf)
            r0 = phead.section_data(0)
            text_length = struct.unpack('>I', r0[4:8])[0]
        while count < text_length:
            pages.append(count)
            count += 2300
        return pages
    def get_pages_accurate(self, mobi_file_path):
        '''
        Build a page mapping by scanning the book's HTML.

        A more accurate, but much slower and more resource intensive,
        alternative to working from the text length alone.

        The model is an average paperback: 32 lines per page and at
        most 70 characters per line. A "line" is recorded whenever a
        paragraph (<p ...>) opens and after every 70 characters of
        text counted inside a paragraph (characters inside tags are
        not counted). Every 32nd recorded line position becomes the
        start of a page.

        If the book header reports a non-zero encryption type (DRM),
        the result of get_pages_fast() is returned instead.

        This could be made more accurate by treating
        <div class="mbp_pagebreak" /> as a new page marker and
        <br> elements as an empty line.

        mobi_file_path: path to the MOBI file to read.

        Returns a list of integer positions, each an index into the
        lower-cased HTML string that was scanned.
        '''
        pages = []
        # Get the MOBI html.
        mr = MobiReader(mobi_file_path, default_log)
        if mr.book_header.encryption_type != 0:
            # DRMed book
            return self.get_pages_fast(mobi_file_path)
        # Presumably this is what populates mr.mobi_html, which is read
        # below -- confirm against MobiReader.
        mr.extract_text()
        # States
        in_tag = False        # between '<' and the matching '>'
        in_p = False          # inside an open paragraph element
        check_p = False       # the next character decides whether this is a p tag
        closing = False       # a '/' was seen right after '<'
        p_char_count = 0      # text characters counted since the last 70-char line
        # Get positions of every line
        # A line is either a paragraph starting
        # or every 70 characters in a paragraph.
        lines = []
        # Starts at -1 so that the first character is position 0.
        pos = -1
        # We want this to be as fast as possible so we
        # are going to do one pass across the text. re
        # and string functions will parse the text each
        # time they are called.
        #
        # We can use .lower() here because we are
        # not modifying the text. In this case the case
        # doesn't matter just the absolute character and
        # the position within the stream.
        for c in mr.mobi_html.lower():
            pos += 1
            # Check if we are starting or stopping a p tag.
            if check_p:
                if c == '/':
                    # Closing tag: keep check_p set so that the next
                    # character (the tag name) is examined.
                    closing = True
                    continue
                elif c == 'p':
                    # NOTE(review): only the first letter of the tag
                    # name is inspected, so any tag starting with 'p'
                    # is treated as a paragraph -- confirm acceptable.
                    if closing:
                        in_p = False
                    else:
                        in_p = True
                        # '<' is at pos - 1, so this records the
                        # position just before the tag.
                        # NOTE(review): this is -1 if the text begins
                        # with '<p' -- confirm that cannot happen.
                        lines.append(pos - 2)
                # The character after '<' (or '</') has been consumed;
                # stop looking for a tag name.
                check_p = False
                closing = False
                continue
            if c == '<':
                in_tag = True
                check_p = True
                continue
            elif c == '>':
                in_tag = False
                check_p = False
                continue
            # Only text inside a paragraph and outside of markup counts
            # towards the 70 character line length.
            if in_p and not in_tag:
                p_char_count += 1
                if p_char_count == 70:
                    lines.append(pos)
                    p_char_count = 0
                # NOTE(review): p_char_count is not reset when a new
                # paragraph opens, so a partial count carries over into
                # the next paragraph -- confirm this is intended.
        # Every 32 lines is a new page
        for i in xrange(0, len(lines), 32):
            pages.append(lines[i])
        return pages
--- a/src/calibre/devices/kindle/driver.py
+++ b/src/calibre/devices/kindle/driver.py
@ -176,6 +176,28 @@ class KINDLE2(KINDLE):
    PRODUCT_ID = [0x0002, 0x0004]
    BCD        = [0x0100]
    # One entry per user-configurable driver option. Each entry is built
    # as translated text + ':::' + translated text; presumably the first
    # part is the option label and the second its help text -- confirm
    # against the code that consumes EXTRA_CUSTOMIZATION_MESSAGE.
    EXTRA_CUSTOMIZATION_MESSAGE = [
        _('Send page number information when sending books') +
            ':::' +
            _('The Kindle 3 and newer versions can use page number information '
              'in MOBI files. With this option, calibre will calculate and send'
              ' this information to the Kindle when uploading MOBI files by'
              ' USB. Note that the page numbers do not correspond to any paper'
              ' book.'),
        _('Use slower but more accurate page number generation') +
            ':::' +
            _('There are two ways to generate the page number information. Using the more accurate '
              'generator will produce pages that correspond better to a printed book. '
              'However, this method is slower and will slow down sending files '
              'to the Kindle.'),
    ]
    # Default values, listed in the same order as the messages above:
    # page numbers are sent by default, the accurate generator is off.
    EXTRA_CUSTOMIZATION_DEFAULT = [
        True,
        False,
    ]
    # Indices used to look up the options in opts.extra_customization.
    # They must stay in step with the order of the two lists above.
    OPT_APNX           = 0
    OPT_APNX_ACCURATE  = 1
    def books(self, oncard=None, end_session=True):
        bl = USBMS.books(self, oncard=oncard, end_session=end_session)
        # Read collections information
@ -212,13 +234,17 @@ class KINDLE2(KINDLE):
        '''
        Hijacking this function to write the apnx file.
        '''
-        if not filepath.lower().endswith('.mobi'):
+        opts = self.settings()
        if not opts.extra_customization[self.OPT_APNX]:
            return
        if os.path.splitext(filepath.lower())[1] not in ('.azw', '.mobi', '.prc'):
            return
        apnx_path = '%s.apnx' % os.path.join(path, filename)
        apnx_builder = APNXBuilder()
        try:
-            apnx_builder.write_apnx(filepath, apnx_path)
+            apnx_builder.write_apnx(filepath, apnx_path, accurate=opts.extra_customization[self.OPT_APNX_ACCURATE])
        except:
            print 'Failed to generate APNX'
            import traceback