Kindle driver: Add an option to turn off sending page number information. Also add an option to use a more accurate but slower algorithm to calculate page numbers

This commit is contained in:
Kovid Goyal 2011-02-13 16:43:53 -07:00
commit 6ee8ffb9ff
2 changed files with 165 additions and 33 deletions

View File

@ -11,44 +11,42 @@ Generates and writes an APNX page mapping file.
import struct
import uuid
from calibre.ebooks.mobi.reader import MobiReader
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.utils.logging import default_log
class APNXBuilder(object):
'''
2300 characters of uncompressed text per page. This is
not meant to map 1 to 1 to a print book but to be a
close enough measure.
A test book was chosen and the characters were counted
on one page. This number was round to 2240 then 60
characters of markup were added to the total giving
2300.
Uncompressed text length is used because it's easily
accessible in MOBI files (part of the header). Also,
It's faster to work off of the length then to
decompress and parse the actual text.
A better but much more resource intensive and slower
method to calculate the page length would be to parse
the uncompressed text. For each paragraph we would
want to find how many lines it would occupy in a paper
back book. 70 characters per line and 32 lines per page.
So divide the number of characters (minus markup) in
each paragraph by 70. If there are less than 70
characters in the paragraph then it is 1 line. Then,
count every 32 lines and mark that location as a page.
Create an APNX file using a pseudo page mapping.
'''
def write_apnx(self, mobi_file_path, apnx_path):
def write_apnx(self, mobi_file_path, apnx_path, accurate=True):
# Check that this is really a MOBI file.
with open(mobi_file_path, 'rb') as mf:
phead = PdbHeaderReader(mf)
r0 = phead.section_data(0)
text_length = struct.unpack('>I', r0[4:8])[0]
ident = PdbHeaderReader(mf).identity()
if ident != 'BOOKMOBI':
raise Exception(_('Not a valid MOBI file. Reports identity of %s' % ident))
pages = self.get_pages(text_length)
# Get the pages depending on the chosen parser
pages = []
if accurate:
try:
pages = self.get_pages_accurate(mobi_file_path)
except:
# Fall back to the fast parser if we can't
# use the accurate one. Typically this is
# due to the file having DRM.
pages = self.get_pages_fast(mobi_file_path)
else:
pages = self.get_pages_fast(mobi_file_path)
if not pages:
raise Exception(_('Could not generate page mapping.'))
# Generate the APNX file from the page mapping.
apnx = self.generate_apnx(pages)
# Write the APNX.
with open(apnx_path, 'wb') as apnxf:
apnxf.write(apnx)
@ -73,18 +71,126 @@ class APNXBuilder(object):
apnx += struct.pack('>H', 32)
apnx += page_header
# write page values to apnx
# Write page values to APNX.
for page in pages:
apnx += struct.pack('>L', page)
apnx += struct.pack('>I', page)
return apnx
def get_pages(self, text_length):
def get_pages_fast(self, mobi_file_path):
'''
2300 characters of uncompressed text per page. This is
not meant to map 1 to 1 to a print book but to be a
close enough measure.
A test book was chosen and the characters were counted
on one page. This number was round to 2240 then 60
characters of markup were added to the total giving
2300.
Uncompressed text length is used because it's easily
accessible in MOBI files (part of the header). Also,
It's faster to work off of the length then to
decompress and parse the actual text.
'''
text_length = 0
pages = []
count = 0
with open(mobi_file_path, 'rb') as mf:
phead = PdbHeaderReader(mf)
r0 = phead.section_data(0)
text_length = struct.unpack('>I', r0[4:8])[0]
while count < text_length:
pages.append(count)
count += 2300
return pages
def get_pages_accurate(self, mobi_file_path):
'''
A more accurate but much more resource intensive and slower
method to calculate the page length.
Parses the uncompressed text. In an average paper back book
There are 32 lines per page and a maximum of 70 characters
per line.
Each paragraph starts a new line and every 70 characters
(minus markup) in a paragraph starts a new line. The
position after every 30 lines will be marked as a new
page.
This can be make more accurate by accounting for
<div class="mbp_pagebreak" /> as a new page marker.
And <br> elements as an empty line.
'''
pages = []
# Get the MOBI html.
mr = MobiReader(mobi_file_path, default_log)
if mr.book_header.encryption_type != 0:
# DRMed book
return self.get_pages_fast(mobi_file_path)
mr.extract_text()
# States
in_tag = False
in_p = False
check_p = False
closing = False
p_char_count = 0
# Get positions of every line
# A line is either a paragraph starting
# or every 70 characters in a paragraph.
lines = []
pos = -1
# We want this to be as fast as possible so we
# are going to do one pass across the text. re
# and string functions will parse the text each
# time they are called.
#
# We can can use .lower() here because we are
# not modifying the text. In this case the case
# doesn't matter just the absolute character and
# the position within the stream.
for c in mr.mobi_html.lower():
pos += 1
# Check if we are starting or stopping a p tag.
if check_p:
if c == '/':
closing = True
continue
elif c == 'p':
if closing:
in_p = False
else:
in_p = True
lines.append(pos - 2)
check_p = False
closing = False
continue
if c == '<':
in_tag = True
check_p = True
continue
elif c == '>':
in_tag = False
check_p = False
continue
if in_p and not in_tag:
p_char_count += 1
if p_char_count == 70:
lines.append(pos)
p_char_count = 0
# Every 30 lines is a new page
for i in xrange(0, len(lines), 32):
pages.append(lines[i])
return pages

View File

@ -176,6 +176,28 @@ class KINDLE2(KINDLE):
PRODUCT_ID = [0x0002, 0x0004]
BCD = [0x0100]
EXTRA_CUSTOMIZATION_MESSAGE = [
_('Send page number information when sending books') +
':::' +
_('The Kindle 3 and newer versions can use page number information '
'in MOBI files. With this option, calibre will calculate and send'
' this information to the Kindle when uploading MOBI files by'
' USB. Note that the page numbers do not correspond to any paper'
' book.'),
_('Use slower but more accurate page number generation') +
':::' +
_('There are two ways to generate the page number information. Using the more accurate '
'generator will produce pages that correspond better to a printed book. '
'However, this method is slower and will slow down sending files '
'to the Kindle.'),
]
EXTRA_CUSTOMIZATION_DEFAULT = [
True,
False,
]
OPT_APNX = 0
OPT_APNX_ACCURATE = 1
def books(self, oncard=None, end_session=True):
bl = USBMS.books(self, oncard=oncard, end_session=end_session)
# Read collections information
@ -212,13 +234,17 @@ class KINDLE2(KINDLE):
'''
Hijacking this function to write the apnx file.
'''
if not filepath.lower().endswith('.mobi'):
opts = self.settings()
if not opts.extra_customization[self.OPT_APNX]:
return
if os.path.splitext(filepath.lower())[1] not in ('.azw', '.mobi', '.prc'):
return
apnx_path = '%s.apnx' % os.path.join(path, filename)
apnx_builder = APNXBuilder()
try:
apnx_builder.write_apnx(filepath, apnx_path)
apnx_builder.write_apnx(filepath, apnx_path, accurate=opts.extra_customization[self.OPT_APNX_ACCURATE])
except:
print 'Failed to generate APNX'
import traceback