Kindle Interface: APNX add support for accurate generation. Add comments and descriptions to APNX file. Add interface options for controlling how and if the APNX file is generated.

This commit is contained in:
John Schember 2011-02-12 14:07:32 -05:00
parent 527bce3e5e
commit e0a0c08839
2 changed files with 164 additions and 33 deletions

View File

@ -11,44 +11,43 @@ Generates and writes an APNX page mapping file.
import struct
import uuid
from calibre.ebooks import DRMError
from calibre.ebooks.mobi.reader import MobiReader
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.utils.logging import default_log
class APNXBuilder(object):
'''
2300 characters of uncompressed text per page. This is
not meant to map 1 to 1 to a print book but to be a
close enough measure.
A test book was chosen and the characters were counted
on one page. This number was rounded to 2240, then 60
characters of markup were added to the total, giving
2300.
Uncompressed text length is used because it's easily
accessible in MOBI files (part of the header). Also,
it's faster to work off of the length than to
decompress and parse the actual text.
A better but much more resource intensive and slower
method to calculate the page length would be to parse
the uncompressed text. For each paragraph we would
want to find how many lines it would occupy in a
paperback book: 70 characters per line and 32 lines per page.
So divide the number of characters (minus markup) in
each paragraph by 70. If there are fewer than 70
characters in the paragraph then it is 1 line. Then,
count every 32 lines and mark that location as a page.
Create an APNX file using a pseudo page mapping.
'''
def write_apnx(self, mobi_file_path, apnx_path):
def write_apnx(self, mobi_file_path, apnx_path, accurate=True):
# Check that this is really a MOBI file.
with open(mobi_file_path, 'rb') as mf:
phead = PdbHeaderReader(mf)
r0 = phead.section_data(0)
text_length = struct.unpack('>I', r0[4:8])[0]
ident = PdbHeaderReader(mf).identity()
if ident != 'BOOKMOBI':
raise Exception(_('Not a valid MOBI file. Reports identity of %s' % ident))
pages = self.get_pages(text_length)
# Get the pages depending on the chosen parser
pages = []
if accurate:
try:
pages = self.get_pages_accurate(mobi_file_path)
except:
# Fall back to the fast parser if we can't
# use the accurate one. Typically this is
# due to the file having DRM.
pages = self.get_pages_fast(mobi_file_path)
else:
pages = self.get_pages_fast(mobi_file_path)
if not pages:
raise Exception(_('Could not generate page mapping.'))
# Generate the APNX file from the page mapping.
apnx = self.generate_apnx(pages)
# Write the APNX.
with open(apnx_path, 'wb') as apnxf:
apnxf.write(apnx)
@ -73,18 +72,125 @@ class APNXBuilder(object):
apnx += struct.pack('>H', 32)
apnx += page_header
# write page values to apnx
# Write page values to APNX.
for page in pages:
apnx += struct.pack('>L', page)
apnx += struct.pack('>I', page)
return apnx
def get_pages(self, text_length):
def get_pages_fast(self, mobi_file_path):
    '''
    Build a pseudo page mapping from the uncompressed text
    length alone: one page every 2300 characters.

    This is not meant to map 1 to 1 to a print book, only to
    be a close enough measure. A test book was chosen and the
    characters on one page were counted. That number was
    rounded to 2240, then 60 characters of markup were added,
    giving 2300.

    The length is unpacked from bytes 4-8 of the first PDB
    section as a big-endian unsigned 32-bit integer. Working
    from the length is faster than decompressing and parsing
    the actual text.

    Returns a list of character offsets (0, 2300, 4600, ...)
    that are strictly smaller than the text length; the list
    is empty when the length is 0.
    '''
    with open(mobi_file_path, 'rb') as mobi:
        record0 = PdbHeaderReader(mobi).section_data(0)
    (length,) = struct.unpack('>I', record0[4:8])
    # list() keeps the return type a real list on any interpreter.
    return list(range(0, length, 2300))
def get_pages_accurate(self, mobi_file_path):
    '''
    A more accurate but much more resource intensive and slower
    method to calculate the page positions.

    Parses the uncompressed text. In an average paperback book
    there are 32 lines per page and a maximum of 70 characters
    per line.

    Each paragraph starts a new line and every 70 characters
    (minus markup) in a paragraph starts a new line. The
    position of every 32nd line (starting with the first one)
    is marked as a new page.

    This can be made more accurate by accounting for
    <div class="mbp_pagebreak" /> as a new page marker,
    and <br> elements as an empty line.

    Returns a list of character positions within the lowercased
    MOBI html, one per page. The list is empty when the text
    contains no paragraphs.

    Raises DRMError when the book header reports a non-zero
    encryption type.
    '''
    chars_per_line = 70
    lines_per_page = 32

    # Get the MOBI html.
    mr = MobiReader(mobi_file_path, default_log)
    if mr.book_header.encryption_type != 0:
        raise DRMError()
    mr.extract_text()

    # States
    in_tag = False    # between '<' and '>'
    in_p = False      # between an opening and a closing p tag
    check_p = False   # previous character(s) were '<' or '</'
    closing = False   # the tag being checked started with '</'
    p_char_count = 0  # text characters seen on the current line

    # Positions of every line. A line is either a paragraph
    # starting or every 70 characters in a paragraph.
    lines = []

    # We want this to be as fast as possible so we are going to
    # do one pass across the text. re and string functions will
    # parse the text each time they are called.
    #
    # We can use .lower() here because we are not modifying the
    # text. In this case the case doesn't matter, just the
    # absolute character and the position within the stream.
    for pos, c in enumerate(mr.mobi_html.lower()):
        # Check if we are starting or stopping a p tag.
        # NOTE: only the first character of the tag name is
        # inspected, so any tag whose name starts with 'p' is
        # treated like a paragraph.
        if check_p:
            if c == '/':
                closing = True
                continue
            elif c == 'p':
                if closing:
                    in_p = False
                else:
                    in_p = True
                    # A paragraph always begins a fresh line, so
                    # the wrap counter must not carry characters
                    # over from the previous paragraph.
                    p_char_count = 0
                    # Clamp at 0: a text that begins with '<p'
                    # would otherwise record -1, which cannot be
                    # packed as an unsigned value in the APNX.
                    lines.append(max(pos - 2, 0))
            check_p = False
            closing = False
            continue

        if c == '<':
            in_tag = True
            check_p = True
            continue
        elif c == '>':
            in_tag = False
            check_p = False
            continue

        # Only visible paragraph text counts towards line length.
        if in_p and not in_tag:
            p_char_count += 1
            if p_char_count == chars_per_line:
                lines.append(pos)
                p_char_count = 0

    # Every 32 lines is a new page.
    return lines[::lines_per_page]

View File

@ -175,6 +175,27 @@ class KINDLE2(KINDLE):
PRODUCT_ID = [0x0002, 0x0004]
BCD = [0x0100]
EXTRA_CUSTOMIZATION_MESSAGE = [
_('Write page mapping (APNX) file when sending books') +
':::' +
_('The APNX page mapping file is a new feature in the Kindle 3\'s '
'3.1 firmware. It allows for page numbers to that correspond to pages '
'in a print book. This will write an APNX file that uses pseudo page '
'numbers based on the the average page length in a paper back book.'),
_('Use slower but more accurate APNX generation') +
':::' +
_('There are two ways to generate the APNX file. Using the more accurate '
'generator will produce pages that correspond better to a printed book. '
'However, this method is slower and more intensive. Unchecking this '
'option will default to using the faster but less accurate generator.'),
]
EXTRA_CUSTOMIZATION_DEFAULT = [
True,
True,
]
OPT_APNX = 0
OPT_APNX_ACCURATE = 1
def books(self, oncard=None, end_session=True):
bl = USBMS.books(self, oncard=oncard, end_session=end_session)
@ -212,13 +233,17 @@ class KINDLE2(KINDLE):
'''
Hijacking this function to write the apnx file.
'''
if not filepath.lower().endswith('.mobi'):
opts = self.settings()
if not opts.extra_customization[self.OPT_APNX]:
return
if os.path.splitext(filepath.lower())[1] not in ('.azw', '.mobi', '.prc'):
return
apnx_path = '%s.apnx' % os.path.join(path, filename)
apnx_builder = APNXBuilder()
try:
apnx_builder.write_apnx(filepath, apnx_path)
apnx_builder.write_apnx(filepath, apnx_path, accurate=opts.extra_customization[self.OPT_APNX_ACCURATE])
except:
print 'Failed to generate APNX'
import traceback