diff --git a/src/calibre/devices/kindle/apnx.py b/src/calibre/devices/kindle/apnx.py index d8dc9709d9..c98fe7a7fa 100644 --- a/src/calibre/devices/kindle/apnx.py +++ b/src/calibre/devices/kindle/apnx.py @@ -11,44 +11,42 @@ Generates and writes an APNX page mapping file. import struct import uuid +from calibre.ebooks.mobi.reader import MobiReader from calibre.ebooks.pdb.header import PdbHeaderReader +from calibre.utils.logging import default_log class APNXBuilder(object): ''' - 2300 characters of uncompressed text per page. This is - not meant to map 1 to 1 to a print book but to be a - close enough measure. - - A test book was chosen and the characters were counted - on one page. This number was round to 2240 then 60 - characters of markup were added to the total giving - 2300. - - Uncompressed text length is used because it's easily - accessible in MOBI files (part of the header). Also, - It's faster to work off of the length then to - decompress and parse the actual text. - - A better but much more resource intensive and slower - method to calculate the page length would be to parse - the uncompressed text. For each paragraph we would - want to find how many lines it would occupy in a paper - back book. 70 characters per line and 32 lines per page. - So divide the number of characters (minus markup) in - each paragraph by 70. If there are less than 70 - characters in the paragraph then it is 1 line. Then, - count every 32 lines and mark that location as a page. + Create an APNX file using a pseudo page mapping. ''' - def write_apnx(self, mobi_file_path, apnx_path): + def write_apnx(self, mobi_file_path, apnx_path, accurate=True): + # Check that this is really a MOBI file. with open(mobi_file_path, 'rb') as mf: - phead = PdbHeaderReader(mf) - r0 = phead.section_data(0) - text_length = struct.unpack('>I', r0[4:8])[0] + ident = PdbHeaderReader(mf).identity() + if ident != 'BOOKMOBI': + raise Exception(_('Not a valid MOBI file. 
Reports identity of %s' % ident)) - pages = self.get_pages(text_length) + # Get the pages depending on the chosen parser + pages = [] + if accurate: + try: + pages = self.get_pages_accurate(mobi_file_path) + except: + # Fall back to the fast parser if we can't + # use the accurate one. Typically this is + # due to the file having DRM. + pages = self.get_pages_fast(mobi_file_path) + else: + pages = self.get_pages_fast(mobi_file_path) + + if not pages: + raise Exception(_('Could not generate page mapping.')) + + # Generate the APNX file from the page mapping. apnx = self.generate_apnx(pages) + # Write the APNX. with open(apnx_path, 'wb') as apnxf: apnxf.write(apnx) @@ -73,18 +71,126 @@ class APNXBuilder(object): apnx += struct.pack('>H', 32) apnx += page_header - # write page values to apnx + # Write page values to APNX. for page in pages: - apnx += struct.pack('>L', page) + apnx += struct.pack('>I', page) return apnx - def get_pages(self, text_length): + def get_pages_fast(self, mobi_file_path): + ''' + 2300 characters of uncompressed text per page. This is + not meant to map 1 to 1 to a print book but to be a + close enough measure. + + A test book was chosen and the characters were counted + on one page. This number was round to 2240 then 60 + characters of markup were added to the total giving + 2300. + + Uncompressed text length is used because it's easily + accessible in MOBI files (part of the header). Also, + It's faster to work off of the length then to + decompress and parse the actual text. + ''' + text_length = 0 pages = [] count = 0 + with open(mobi_file_path, 'rb') as mf: + phead = PdbHeaderReader(mf) + r0 = phead.section_data(0) + text_length = struct.unpack('>I', r0[4:8])[0] + while count < text_length: pages.append(count) count += 2300 return pages + + def get_pages_accurate(self, mobi_file_path): + ''' + A more accurate but much more resource intensive and slower + method to calculate the page length. + + Parses the uncompressed text. 
In an average paperback book + there are 32 lines per page and a maximum of 70 characters + per line. + + Each paragraph starts a new line and every 70 characters + (minus markup) in a paragraph starts a new line. The + position after every 30 lines will be marked as a new + page. + + This can be made more accurate by accounting for +
as a new page marker. + And