Kindle driver: Add an option to turn off sending page number information. Also add an option to use a more accurate but slower algorithm to calculate page numbers

2025-08-30 23:00:21 -04:00 · 2011-02-13 16:43:53 -07:00 · 2011-02-13 16:43:53 -07:00 · 6ee8ffb9ff
commit 6ee8ffb9ff
parent 7873df5073 e0a0c08839
2 changed files with 165 additions and 33 deletions
--- a/src/calibre/devices/kindle/apnx.py
+++ b/src/calibre/devices/kindle/apnx.py
@ -11,44 +11,42 @@ Generates and writes an APNX page mapping file.
 import struct
 import uuid

+from calibre.ebooks.mobi.reader import MobiReader
 from calibre.ebooks.pdb.header import PdbHeaderReader
+from calibre.utils.logging import default_log

 class APNXBuilder(object):
    '''
-    2300 characters of uncompressed text per page. This is
-    not meant to map 1 to 1 to a print book but to be a
-    close enough measure.
-    
-    A test book was chosen and the characters were counted
-    on one page. This number was round to 2240 then 60
-    characters of markup were added to the total giving
-    2300.
-    
-    Uncompressed text length is used because it's easily
-    accessible in MOBI files (part of the header). Also,
-    It's faster to work off of the length then to
-    decompress and parse the actual text.
-    
-    A better but much more resource intensive and slower
-    method to calculate the page length would be to parse
-    the uncompressed text. For each paragraph we would
-    want to find how many lines it would occupy in a paper
-    back book. 70 characters per line and 32 lines per page.
-    So divide the number of characters (minus markup) in
-    each paragraph by 70. If there are less than 70
-    characters in the paragraph then it is 1 line. Then,
-    count every 32 lines and mark that location as a page.
+    Create an APNX file using a pseudo page mapping.
    '''

-    def write_apnx(self, mobi_file_path, apnx_path):
+    def write_apnx(self, mobi_file_path, apnx_path, accurate=True):
+        # Check that this is really a MOBI file.
        with open(mobi_file_path, 'rb') as mf:
-            phead = PdbHeaderReader(mf)
-            r0 = phead.section_data(0)
-            text_length = struct.unpack('>I', r0[4:8])[0]
+            ident = PdbHeaderReader(mf).identity()
+        if ident != 'BOOKMOBI':
+            raise Exception(_('Not a valid MOBI file. Reports identity of %s' % ident))

-        pages = self.get_pages(text_length)
+        # Get the pages depending on the chosen parser
+        pages = []
+        if accurate:
+            try:
+                pages = self.get_pages_accurate(mobi_file_path)
+            except:
+                # Fall back to the fast parser if we can't
+                # use the accurate one. Typically this is
+                # due to the file having DRM.
+                pages = self.get_pages_fast(mobi_file_path)
+        else:
+            pages = self.get_pages_fast(mobi_file_path)
+
+        if not pages:
+            raise Exception(_('Could not generate page mapping.'))
+
+        # Generate the APNX file from the page mapping.
        apnx = self.generate_apnx(pages)

+        # Write the APNX.
        with open(apnx_path, 'wb') as apnxf:
            apnxf.write(apnx)

@ -73,18 +71,126 @@ class APNXBuilder(object):
        apnx += struct.pack('>H', 32)
        apnx += page_header

-        # write page values to apnx
+        # Write page values to APNX.
        for page in pages:
-            apnx += struct.pack('>L', page)
+            apnx += struct.pack('>I', page)

        return apnx

-    def get_pages(self, text_length):
+    def get_pages_fast(self, mobi_file_path):
+        '''
+        2300 characters of uncompressed text per page. This is
+        not meant to map 1 to 1 to a print book but to be a
+        close enough measure.
+
+        A test book was chosen and the characters were counted
+        on one page. This number was rounded to 2240, then 60
+        characters of markup were added to the total giving
+        2300.
+
+        Uncompressed text length is used because it's easily
+        accessible in MOBI files (part of the header). Also,
+        it's faster to work off of the length than to
+        decompress and parse the actual text.
+        '''
+        text_length = 0
        pages = []
        count = 0

+        with open(mobi_file_path, 'rb') as mf:
+            phead = PdbHeaderReader(mf)
+            r0 = phead.section_data(0)
+            text_length = struct.unpack('>I', r0[4:8])[0]
+
        while count < text_length:
            pages.append(count)
            count += 2300

        return pages
+
+    def get_pages_accurate(self, mobi_file_path):
+        '''
+        A more accurate but much more resource intensive and slower
+        method to calculate the page length.
+
+        Parses the uncompressed text. In an average paperback book
+        there are 32 lines per page and a maximum of 70 characters
+        per line.
+
+        Each paragraph starts a new line and every 70 characters
+        (minus markup) in a paragraph starts a new line. The
+        position after every 32 lines will be marked as a new
+        page.
+
+        This can be made more accurate by accounting for
+        <div class="mbp_pagebreak" /> as a new page marker
+        and <br> elements as an empty line.
+        '''
+        pages = []
+
+        # Get the MOBI html.
+        mr = MobiReader(mobi_file_path, default_log)
+        if mr.book_header.encryption_type != 0:
+            # DRMed book
+            return self.get_pages_fast(mobi_file_path)
+        mr.extract_text()
+
+        # States
+        in_tag = False
+        in_p = False
+        check_p = False
+        closing = False
+        p_char_count = 0
+
+        # Get positions of every line
+        # A line is either a paragraph starting
+        # or every 70 characters in a paragraph.
+        lines = []
+        pos = -1
+        # We want this to be as fast as possible so we
+        # are going to do one pass across the text. re
+        # and string functions will parse the text each
+        # time they are called.
+        #
+        # We can use .lower() here because we are
+        # not modifying the text. In this case the case
+        # doesn't matter, just the absolute character and
+        # the position within the stream.
+        for c in mr.mobi_html.lower():
+            pos += 1
+
+            # Check if we are starting or stopping a p tag.
+            if check_p:
+                if c == '/':
+                    closing = True
+                    continue
+                elif c == 'p':
+                    if closing:
+                        in_p = False
+                    else:
+                        in_p = True
+                        lines.append(pos - 2)
+                check_p = False
+                closing = False
+                continue
+
+            if c == '<':
+                in_tag = True
+                check_p = True
+                continue
+            elif c == '>':
+                in_tag = False
+                check_p = False
+                continue
+
+            if in_p and not in_tag:
+                p_char_count += 1
+                if p_char_count == 70:
+                    lines.append(pos)
+                    p_char_count = 0
+
+        # Every 32 lines is a new page
+        for i in xrange(0, len(lines), 32):
+            pages.append(lines[i])
+
+        return pages
--- a/src/calibre/devices/kindle/driver.py
+++ b/src/calibre/devices/kindle/driver.py
@ -176,6 +176,28 @@ class KINDLE2(KINDLE):
    PRODUCT_ID = [0x0002, 0x0004]
    BCD        = [0x0100]

+    EXTRA_CUSTOMIZATION_MESSAGE = [
+        _('Send page number information when sending books') +
+            ':::' +
+            _('The Kindle 3 and newer versions can use page number information '
+              'in MOBI files. With this option, calibre will calculate and send'
+              ' this information to the Kindle when uploading MOBI files by'
+              ' USB. Note that the page numbers do not correspond to any paper'
+              ' book.'),
+        _('Use slower but more accurate page number generation') +
+            ':::' +
+            _('There are two ways to generate the page number information. Using the more accurate '
+              'generator will produce pages that correspond better to a printed book. '
+              'However, this method is slower and will slow down sending files '
+              'to the Kindle.'),
+    ]
+    EXTRA_CUSTOMIZATION_DEFAULT = [
+        True,
+        False,
+    ]
+    OPT_APNX           = 0
+    OPT_APNX_ACCURATE  = 1
+
    def books(self, oncard=None, end_session=True):
        bl = USBMS.books(self, oncard=oncard, end_session=end_session)
        # Read collections information
@ -212,13 +234,17 @@ class KINDLE2(KINDLE):
        '''
        Hijacking this function to write the apnx file.
        '''
-        if not filepath.lower().endswith('.mobi'):
+        opts = self.settings()
+        if not opts.extra_customization[self.OPT_APNX]:
+            return
+
+        if os.path.splitext(filepath.lower())[1] not in ('.azw', '.mobi', '.prc'):
            return

        apnx_path = '%s.apnx' % os.path.join(path, filename)
        apnx_builder = APNXBuilder()
        try:
-            apnx_builder.write_apnx(filepath, apnx_path)
+            apnx_builder.write_apnx(filepath, apnx_path, accurate=opts.extra_customization[self.OPT_APNX_ACCURATE])
        except:
            print 'Failed to generate APNX'
            import traceback