mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Kindle Interface: APNX add support for accurate generation. Add comments and descriptions to APNX file. Add interface options for controlling how and if the APNX file is generated.
This commit is contained in:
parent
527bce3e5e
commit
e0a0c08839
@ -11,44 +11,43 @@ Generates and writes an APNX page mapping file.
|
||||
import struct
|
||||
import uuid
|
||||
|
||||
from calibre.ebooks import DRMError
|
||||
from calibre.ebooks.mobi.reader import MobiReader
|
||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||
from calibre.utils.logging import default_log
|
||||
|
||||
class APNXBuilder(object):
|
||||
'''
|
||||
2300 characters of uncompressed text per page. This is
|
||||
not meant to map 1 to 1 to a print book but to be a
|
||||
close enough measure.
|
||||
|
||||
A test book was chosen and the characters were counted
|
||||
on one page. This number was rounded to 2240, then 60
|
||||
characters of markup were added to the total giving
|
||||
2300.
|
||||
|
||||
Uncompressed text length is used because it's easily
|
||||
accessible in MOBI files (part of the header). Also,
|
||||
it's faster to work off of the length than to
|
||||
decompress and parse the actual text.
|
||||
|
||||
A better but much more resource intensive and slower
|
||||
method to calculate the page length would be to parse
|
||||
the uncompressed text. For each paragraph we would
|
||||
want to find how many lines it would occupy in a paper
|
||||
back book. 70 characters per line and 32 lines per page.
|
||||
So divide the number of characters (minus markup) in
|
||||
each paragraph by 70. If there are less than 70
|
||||
characters in the paragraph then it is 1 line. Then,
|
||||
count every 32 lines and mark that location as a page.
|
||||
Create an APNX file using a pseudo page mapping.
|
||||
'''
|
||||
|
||||
def write_apnx(self, mobi_file_path, apnx_path, accurate=True):
    '''
    Generate a page mapping for a MOBI file and write it to an APNX file.

    :param mobi_file_path: Path of the MOBI file the mapping is built from.
    :param apnx_path: Path the generated APNX data is written to.
    :param accurate: When True, get_pages_accurate() is tried first and
        get_pages_fast() is used if it fails or finds no pages. When
        False, only get_pages_fast() is used.
    :raises Exception: If the file does not report a BOOKMOBI identity,
        or if neither parser produced any pages.
    '''
    # Check that this is really a MOBI file.
    with open(mobi_file_path, 'rb') as mf:
        ident = PdbHeaderReader(mf).identity()
    if ident != 'BOOKMOBI':
        # Interpolate *after* the translation lookup; formatting first
        # would hand gettext a string that is never in the catalog.
        raise Exception(_('Not a valid MOBI file. Reports identity of %s') % ident)

    # Get the pages depending on the chosen parser.
    pages = []
    if accurate:
        try:
            pages = self.get_pages_accurate(mobi_file_path)
        except Exception:
            # Fall back to the fast parser if we can't use the accurate
            # one. Typically this is due to the file having DRM.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            pages = []

    # Used when the fast parser was requested, when the accurate parser
    # failed, or when the accurate parser found no lines to map.
    if not pages:
        pages = self.get_pages_fast(mobi_file_path)

    if not pages:
        raise Exception(_('Could not generate page mapping.'))

    # Generate the APNX file from the page mapping.
    apnx = self.generate_apnx(pages)

    # Write the APNX.
    with open(apnx_path, 'wb') as apnxf:
        apnxf.write(apnx)
|
||||
|
||||
@ -73,18 +72,125 @@ class APNXBuilder(object):
|
||||
apnx += struct.pack('>H', 32)
|
||||
apnx += page_header
|
||||
|
||||
# write page values to apnx
|
||||
# Write page values to APNX.
|
||||
for page in pages:
|
||||
apnx += struct.pack('>L', page)
|
||||
apnx += struct.pack('>I', page)
|
||||
|
||||
return apnx
|
||||
|
||||
def get_pages(self, text_length):
|
||||
def get_pages_fast(self, mobi_file_path):
    '''
    Build a pseudo page mapping from the uncompressed text length alone.

    A page is taken to be 2300 characters of uncompressed text. This is
    not meant to map 1 to 1 to a print book, only to be a close enough
    measure. The figure comes from counting the characters on one page
    of a test book, rounding that to 2240, and adding 60 characters of
    markup.

    The uncompressed text length is used because it is read straight
    from bytes 4-8 of PDB section 0, which is faster than decompressing
    and parsing the actual text.

    Returns the list of page start offsets: 0, 2300, 4600, ... for every
    offset below the text length (empty when the length is 0).
    '''
    chars_per_page = 2300

    with open(mobi_file_path, 'rb') as stream:
        record0 = PdbHeaderReader(stream).section_data(0)
        # Big-endian unsigned 32-bit text length.
        (text_length,) = struct.unpack('>I', record0[4:8])

    return list(range(0, text_length, chars_per_page))
|
||||
|
||||
def get_pages_accurate(self, mobi_file_path):
    '''
    A more accurate but much more resource intensive and slower
    method to calculate the page length.

    Parses the uncompressed text. In an average paperback book
    there are 32 lines per page and a maximum of 70 characters
    per line.

    Each paragraph starts a new line and every 70 characters
    (minus markup) in a paragraph starts a new line. The
    position of every 32nd line will be marked as a new
    page.

    This can be made more accurate by accounting for
    <div class="mbp_pagebreak" /> as a new page marker.
    And <br> elements as an empty line.

    Returns a list of positions within the lower-cased mobi_html,
    one per page. Raises DRMError when the book header reports a
    non-zero encryption type.
    '''
    pages = []

    # Get the MOBI html.
    mr = MobiReader(mobi_file_path, default_log)
    if mr.book_header.encryption_type != 0:
        raise DRMError()
    mr.extract_text()

    # States
    in_tag = False      # between a '<' and the matching '>'
    in_p = False        # inside an open paragraph
    check_p = False     # the previous character was '<' (or '</')
    closing = False     # a '/' was seen right after '<'
    p_char_count = 0    # text characters counted since the last line break
    # NOTE(review): p_char_count is only reset when it reaches 70, so the
    # count carries over from one paragraph to the next - confirm intended.

    # Get positions of every line
    # A line is either a paragraph starting
    # or every 70 characters in a paragraph.
    lines = []
    pos = -1
    # We want this to be as fast as possible so we
    # are going to do one pass across the text. re
    # and string functions will parse the text each
    # time they are called.
    #
    # We can use .lower() here because we are
    # not modifying the text. In this case the case
    # doesn't matter just the absolute character and
    # the position within the stream.
    for c in mr.mobi_html.lower():
        pos += 1

        # Check if we are starting or stopping a p tag.
        # Only the character directly after '<' (or after '</') is
        # examined; anything other than 'p' ends the check.
        # NOTE(review): any tag whose name merely starts with 'p'
        # (e.g. <pre>) is treated as a paragraph too - confirm acceptable.
        if check_p:
            if c == '/':
                closing = True
                continue
            elif c == 'p':
                if closing:
                    in_p = False
                else:
                    in_p = True
                    # pos is the index of the 'p' and pos - 1 the '<',
                    # so this records the index just before the tag.
                    lines.append(pos - 2)
            check_p = False
            closing = False
            continue

        if c == '<':
            in_tag = True
            check_p = True
            continue
        elif c == '>':
            in_tag = False
            check_p = False
            continue

        # Count only text characters (not markup) inside a paragraph.
        if in_p and not in_tag:
            p_char_count += 1
            if p_char_count == 70:
                lines.append(pos)
                p_char_count = 0

    # Every 32 lines is a new page
    for i in xrange(0, len(lines), 32):
        pages.append(lines[i])

    return pages
|
||||
|
@ -176,6 +176,27 @@ class KINDLE2(KINDLE):
|
||||
PRODUCT_ID = [0x0002, 0x0004]
|
||||
BCD = [0x0100]
|
||||
|
||||
# One entry per user-configurable option, in the same order as
# EXTRA_CUSTOMIZATION_DEFAULT and the OPT_* indexes below.
# NOTE(review): presumably the text before ':::' is the option label and
# the text after it is the help/tooltip text - confirm against the
# device configuration UI.
EXTRA_CUSTOMIZATION_MESSAGE = [
    _('Write page mapping (APNX) file when sending books') +
    ':::' +
    _('The APNX page mapping file is a new feature in the Kindle 3\'s '
      '3.1 firmware. It allows for page numbers that correspond to pages '
      'in a print book. This will write an APNX file that uses pseudo page '
      'numbers based on the average page length in a paper back book.'),
    _('Use slower but more accurate APNX generation') +
    ':::' +
    _('There are two ways to generate the APNX file. Using the more accurate '
      'generator will produce pages that correspond better to a printed book. '
      'However, this method is slower and more intensive. Unchecking this '
      'option will default to using the faster but less accurate generator.'),
]
# Both options are enabled by default.
EXTRA_CUSTOMIZATION_DEFAULT = [
    True,
    True,
]
# Indexes into the extra_customization settings list.
OPT_APNX = 0
OPT_APNX_ACCURATE = 1
|
||||
|
||||
def books(self, oncard=None, end_session=True):
|
||||
bl = USBMS.books(self, oncard=oncard, end_session=end_session)
|
||||
# Read collections information
|
||||
@ -212,13 +233,17 @@ class KINDLE2(KINDLE):
|
||||
'''
|
||||
Hijacking this function to write the apnx file.
|
||||
'''
|
||||
if not filepath.lower().endswith('.mobi'):
|
||||
opts = self.settings()
|
||||
if not opts.extra_customization[self.OPT_APNX]:
|
||||
return
|
||||
|
||||
if os.path.splitext(filepath.lower())[1] not in ('.azw', '.mobi', '.prc'):
|
||||
return
|
||||
|
||||
apnx_path = '%s.apnx' % os.path.join(path, filename)
|
||||
apnx_builder = APNXBuilder()
|
||||
try:
|
||||
apnx_builder.write_apnx(filepath, apnx_path)
|
||||
apnx_builder.write_apnx(filepath, apnx_path, accurate=opts.extra_customization[self.OPT_APNX_ACCURATE])
|
||||
except:
|
||||
print 'Failed to generate APNX'
|
||||
import traceback
|
||||
|
Loading…
x
Reference in New Issue
Block a user