From e0a0c08839ebe3b72b20451b76ec0fdcb686e884 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 12 Feb 2011 14:07:32 -0500
Subject: [PATCH] Kindle Interface: APNX add support for accurate generation.
 Add comments and descriptions to APNX file. Add interface options for
 controlling how and if the APNX file is generated.

---
 src/calibre/devices/kindle/apnx.py   | 168 ++++++++++++++++++++++-----
 src/calibre/devices/kindle/driver.py |  29 ++++-
 2 files changed, 164 insertions(+), 33 deletions(-)

diff --git a/src/calibre/devices/kindle/apnx.py b/src/calibre/devices/kindle/apnx.py
index d8dc9709d9..721b86f36f 100644
--- a/src/calibre/devices/kindle/apnx.py
+++ b/src/calibre/devices/kindle/apnx.py
@@ -11,44 +11,43 @@ Generates and writes an APNX page mapping file.
 import struct
 import uuid
 
+from calibre.ebooks import DRMError
+from calibre.ebooks.mobi.reader import MobiReader
 from calibre.ebooks.pdb.header import PdbHeaderReader
+from calibre.utils.logging import default_log
 
 class APNXBuilder(object):
     '''
-    2300 characters of uncompressed text per page. This is
-    not meant to map 1 to 1 to a print book but to be a
-    close enough measure.
-    
-    A test book was chosen and the characters were counted
-    on one page. This number was round to 2240 then 60
-    characters of markup were added to the total giving
-    2300.
-    
-    Uncompressed text length is used because it's easily
-    accessible in MOBI files (part of the header). Also,
-    It's faster to work off of the length then to
-    decompress and parse the actual text.
-    
-    A better but much more resource intensive and slower
-    method to calculate the page length would be to parse
-    the uncompressed text. For each paragraph we would
-    want to find how many lines it would occupy in a paper
-    back book. 70 characters per line and 32 lines per page.
-    So divide the number of characters (minus markup) in
-    each paragraph by 70. If there are less than 70
-    characters in the paragraph then it is 1 line. Then,
-    count every 32 lines and mark that location as a page.
+    Create an APNX file using a pseudo page mapping.
     '''
 
-    def write_apnx(self, mobi_file_path, apnx_path):
+    def write_apnx(self, mobi_file_path, apnx_path, accurate=True):
+        # Check that this is really a MOBI file.
         with open(mobi_file_path, 'rb') as mf:
-            phead = PdbHeaderReader(mf)
-            r0 = phead.section_data(0)
-            text_length = struct.unpack('>I', r0[4:8])[0]
+            ident = PdbHeaderReader(mf).identity()
+        if ident != 'BOOKMOBI':
+            raise Exception(_('Not a valid MOBI file. Reports identity of %s' % ident))
 
-        pages = self.get_pages(text_length)
+        # Get the pages depending on the chosen parser
+        pages = []
+        if accurate:
+            try:
+                pages = self.get_pages_accurate(mobi_file_path)
+            except:
+                # Fall back to the fast parser if we can't
+                # use the accurate one. Typically this is
+                # due to the file having DRM.
+                pages = self.get_pages_fast(mobi_file_path)
+        else:
+            pages = self.get_pages_fast(mobi_file_path)
+        
+        if not pages:
+            raise Exception(_('Could not generate page mapping.'))
+        
+        # Generate the APNX file from the page mapping.
         apnx = self.generate_apnx(pages)
 
+        # Write the APNX.
         with open(apnx_path, 'wb') as apnxf:
             apnxf.write(apnx)
 
@@ -73,18 +72,125 @@ class APNXBuilder(object):
         apnx += struct.pack('>H', 32)
         apnx += page_header
 
-        # write page values to apnx
+        # Write page values to APNX.
         for page in pages:
-            apnx += struct.pack('>L', page)
+            apnx += struct.pack('>I', page)
 
         return apnx
 
-    def get_pages(self, text_length):
+    def get_pages_fast(self, mobi_file_path):
+        '''
+        2300 characters of uncompressed text per page. This is
+        not meant to map 1 to 1 to a print book but to be a
+        close enough measure.
+        
+        A test book was chosen and the characters were counted
+        on one page. This number was rounded to 2240 then 60
+        characters of markup were added to the total giving
+        2300.
+        
+        Uncompressed text length is used because it's easily
+        accessible in MOBI files (part of the header). Also,
+        it's faster to work off of the length than to
+        decompress and parse the actual text.
+        '''
+        text_length = 0
         pages = []
         count = 0
+        
+        with open(mobi_file_path, 'rb') as mf:
+            phead = PdbHeaderReader(mf)
+            r0 = phead.section_data(0)
+            text_length = struct.unpack('>I', r0[4:8])[0]
 
         while count < text_length:
             pages.append(count)
             count += 2300
 
         return pages
+    
+    def get_pages_accurate(self, mobi_file_path):
+        '''
+        A more accurate but much more resource intensive and slower
+        method to calculate the page length.
+        
+        Parses the uncompressed text. In an average paperback book
+        there are 32 lines per page and a maximum of 70 characters
+        per line.
+        
+        Each paragraph starts a new line and every 70 characters
+        (minus markup) in a paragraph starts a new line. The
+        position after every 32 lines will be marked as a new
+        page.
+        
+        This can be made more accurate by accounting for
+        <div class="mbp_pagebreak" /> as a new page marker.
+        And <br> elements as an empty line.
+        '''
+        pages = []
+        
+        # Get the MOBI html.
+        mr = MobiReader(mobi_file_path, default_log)
+        if mr.book_header.encryption_type != 0:
+            raise DRMError()
+        mr.extract_text()
+        
+        # States
+        in_tag = False
+        in_p = False
+        check_p = False
+        closing = False
+        p_char_count = 0
+        
+        # Get positions of every line
+        # A line is either a paragraph starting
+        # or every 70 characters in a paragraph.
+        lines = []
+        pos = -1
+        # We want this to be as fast as possible so we
+        # are going to do one pass across the text. re
+        # and string functions will parse the text each
+        # time they are called.
+        #
+        # We can use .lower() here because we are
+        # not modifying the text. In this case the case
+        # doesn't matter just the absolute character and
+        # the position within the stream.
+        for c in mr.mobi_html.lower():
+            pos += 1
+            
+            # Check if we are starting or stopping a p tag.
+            if check_p:
+                if c == '/':
+                    closing = True
+                    continue
+                elif c == 'p':
+                    if closing:
+                        in_p = False
+                    else:
+                        in_p = True
+                        lines.append(pos - 2)
+                check_p = False
+                closing = False
+                continue
+            
+            if c == '<':
+                in_tag = True
+                check_p = True
+                continue
+            elif c == '>':
+                in_tag = False
+                check_p = False
+                continue
+
+            if in_p and not in_tag:
+                p_char_count += 1
+                if p_char_count == 70:
+                    lines.append(pos)
+                    p_char_count = 0
+        
+        # Every 32 lines is a new page
+        for i in xrange(0, len(lines), 32):
+            pages.append(lines[i])
+
+        return pages
diff --git a/src/calibre/devices/kindle/driver.py b/src/calibre/devices/kindle/driver.py
index 5c150eab5a..3f9aa26184 100644
--- a/src/calibre/devices/kindle/driver.py
+++ b/src/calibre/devices/kindle/driver.py
@@ -175,6 +175,27 @@ class KINDLE2(KINDLE):
 
     PRODUCT_ID = [0x0002, 0x0004]
     BCD        = [0x0100]
+    
+    EXTRA_CUSTOMIZATION_MESSAGE = [
+        _('Write page mapping (APNX) file when sending books') +
+            ':::' +
+            _('The APNX page mapping file is a new feature in the Kindle 3\'s '
+              '3.1 firmware. It allows for page numbers to that correspond to pages '
+              'in a print book. This will write an APNX file that uses pseudo page '
+              'numbers based on the the average page length in a paper back book.'),
+        _('Use slower but more accurate APNX generation') +
+            ':::' +
+            _('There are two ways to generate the APNX file. Using the more accurate '
+              'generator will produce pages that correspond better to a printed book. '
+              'However, this method is slower and more intensive. Unchecking this '
+              'option will default to using the faster but less accurate generator.'),
+    ]
+    EXTRA_CUSTOMIZATION_DEFAULT = [
+        True,
+        True,
+    ]
+    OPT_APNX           = 0
+    OPT_APNX_ACCURATE  = 1
 
     def books(self, oncard=None, end_session=True):
         bl = USBMS.books(self, oncard=oncard, end_session=end_session)
@@ -212,13 +233,17 @@ class KINDLE2(KINDLE):
         '''
         Hijacking this function to write the apnx file.
         '''
-        if not filepath.lower().endswith('.mobi'):
+        opts = self.settings()
+        if not opts.extra_customization[self.OPT_APNX]:
+            return
+        
+        if os.path.splitext(filepath.lower())[1] not in ('.azw', '.mobi', '.prc'):
             return
 
         apnx_path = '%s.apnx' % os.path.join(path, filename)
         apnx_builder = APNXBuilder()
         try:
-            apnx_builder.write_apnx(filepath, apnx_path)
+            apnx_builder.write_apnx(filepath, apnx_path, accurate=opts.extra_customization[self.OPT_APNX_ACCURATE])
         except:
             print 'Failed to generate APNX'
             import traceback