mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Kindle driver: Add an option to turn off sending page number information. Also add an option to use a more accurate but slower algorithm to calculate page numbers
This commit is contained in:
commit
6ee8ffb9ff
@ -11,44 +11,42 @@ Generates and writes an APNX page mapping file.
|
||||
import struct
|
||||
import uuid
|
||||
|
||||
from calibre.ebooks.mobi.reader import MobiReader
|
||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||
from calibre.utils.logging import default_log
|
||||
|
||||
class APNXBuilder(object):
|
||||
'''
|
||||
2300 characters of uncompressed text per page. This is
|
||||
not meant to map 1 to 1 to a print book but to be a
|
||||
close enough measure.
|
||||
|
||||
A test book was chosen and the characters were counted
|
||||
on one page. This number was rounded to 2240 then 60
|
||||
characters of markup were added to the total giving
|
||||
2300.
|
||||
|
||||
Uncompressed text length is used because it's easily
|
||||
accessible in MOBI files (part of the header). Also,
|
||||
it's faster to work off of the length than to
|
||||
decompress and parse the actual text.
|
||||
|
||||
A better but much more resource intensive and slower
|
||||
method to calculate the page length would be to parse
|
||||
the uncompressed text. For each paragraph we would
|
||||
want to find how many lines it would occupy in a paper
|
||||
back book. 70 characters per line and 32 lines per page.
|
||||
So divide the number of characters (minus markup) in
|
||||
each paragraph by 70. If there are less than 70
|
||||
characters in the paragraph then it is 1 line. Then,
|
||||
count every 32 lines and mark that location as a page.
|
||||
Create an APNX file using a pseudo page mapping.
|
||||
'''
|
||||
|
||||
def write_apnx(self, mobi_file_path, apnx_path):
|
||||
def write_apnx(self, mobi_file_path, apnx_path, accurate=True):
|
||||
# Check that this is really a MOBI file.
|
||||
with open(mobi_file_path, 'rb') as mf:
|
||||
phead = PdbHeaderReader(mf)
|
||||
r0 = phead.section_data(0)
|
||||
text_length = struct.unpack('>I', r0[4:8])[0]
|
||||
ident = PdbHeaderReader(mf).identity()
|
||||
if ident != 'BOOKMOBI':
|
||||
raise Exception(_('Not a valid MOBI file. Reports identity of %s' % ident))
|
||||
|
||||
pages = self.get_pages(text_length)
|
||||
# Get the pages depending on the chosen parser
|
||||
pages = []
|
||||
if accurate:
|
||||
try:
|
||||
pages = self.get_pages_accurate(mobi_file_path)
|
||||
except:
|
||||
# Fall back to the fast parser if we can't
|
||||
# use the accurate one. Typically this is
|
||||
# due to the file having DRM.
|
||||
pages = self.get_pages_fast(mobi_file_path)
|
||||
else:
|
||||
pages = self.get_pages_fast(mobi_file_path)
|
||||
|
||||
if not pages:
|
||||
raise Exception(_('Could not generate page mapping.'))
|
||||
|
||||
# Generate the APNX file from the page mapping.
|
||||
apnx = self.generate_apnx(pages)
|
||||
|
||||
# Write the APNX.
|
||||
with open(apnx_path, 'wb') as apnxf:
|
||||
apnxf.write(apnx)
|
||||
|
||||
@ -73,18 +71,126 @@ class APNXBuilder(object):
|
||||
apnx += struct.pack('>H', 32)
|
||||
apnx += page_header
|
||||
|
||||
# write page values to apnx
|
||||
# Write page values to APNX.
|
||||
for page in pages:
|
||||
apnx += struct.pack('>L', page)
|
||||
apnx += struct.pack('>I', page)
|
||||
|
||||
return apnx
|
||||
|
||||
def get_pages(self, text_length):
|
||||
def get_pages_fast(self, mobi_file_path):
    '''
    Build a page mapping from the text length alone.

    One page is assumed to hold 2300 characters of uncompressed
    text. This is not meant to map 1 to 1 to a print book, only
    to be a close enough measure: the characters on one page of a
    test book were counted, rounded to 2240, and 60 characters of
    markup were added, giving 2300.

    The uncompressed text length is used because it is easily
    accessible in MOBI files (it is read here from bytes 4-8 of
    record 0), and working off the length is faster than
    decompressing and parsing the actual text.

    Returns a list of offsets, one per page, starting at 0.
    '''
    chars_per_page = 2300

    with open(mobi_file_path, 'rb') as stream:
        record0 = PdbHeaderReader(stream).section_data(0)
        # Big-endian unsigned 32-bit length field of record 0.
        (uncompressed_len,) = struct.unpack('>I', record0[4:8])

    # One page start every chars_per_page characters, beginning at 0.
    return list(range(0, uncompressed_len, chars_per_page))
|
||||
|
||||
def get_pages_accurate(self, mobi_file_path):
    '''
    A more accurate but much more resource intensive and slower
    method to calculate the page mapping.

    Parses the uncompressed text. In an average paperback book
    there are 32 lines per page and a maximum of 70 characters
    per line.

    Each opening <p tag starts a new line, and every 70 characters
    of text (markup excluded) counted inside paragraphs starts a
    new line. The position of every 32nd line is marked as a new
    page.

    This could be made more accurate by accounting for
    <div class="mbp_pagebreak" /> as a new page marker and
    <br> elements as an empty line.

    Books whose header reports a non-zero encryption type (DRM)
    are handed to get_pages_fast instead.

    Returns a list of positions within the MOBI html, one per page.
    '''
    chars_per_line = 70
    lines_per_page = 32

    # Get the MOBI html.
    mr = MobiReader(mobi_file_path, default_log)
    if mr.book_header.encryption_type != 0:
        # DRMed book
        return self.get_pages_fast(mobi_file_path)
    mr.extract_text()

    # States
    in_tag = False
    in_p = False
    check_p = False
    closing = False
    p_char_count = 0

    # Get positions of every line.
    # A line is either a paragraph starting
    # or every 70 characters in a paragraph.
    lines = []
    pos = -1
    # We want this to be as fast as possible so we
    # are going to do one pass across the text. re
    # and string functions will parse the text each
    # time they are called.
    #
    # We can use .lower() here because we are not
    # modifying the text. The case does not matter,
    # only the character and its position within
    # the stream.
    for c in mr.mobi_html.lower():
        pos += 1

        # Check if we are starting or stopping a p tag.
        # Only the character(s) right after '<' are examined.
        if check_p:
            if c == '/':
                closing = True
                continue
            elif c == 'p':
                if closing:
                    in_p = False
                else:
                    in_p = True
                    # Clamp at 0: when the text begins with '<p'
                    # the offset would otherwise be -1, which
                    # cannot be packed as an unsigned value when
                    # the APNX is generated.
                    lines.append(max(pos - 2, 0))
            check_p = False
            closing = False
            continue

        if c == '<':
            in_tag = True
            check_p = True
            continue
        elif c == '>':
            in_tag = False
            check_p = False
            continue

        # Count only text characters inside a paragraph.
        if in_p and not in_tag:
            p_char_count += 1
            if p_char_count == chars_per_line:
                lines.append(pos)
                p_char_count = 0

    # Every 32 lines is a new page.
    return lines[::lines_per_page]
|
||||
|
@ -176,6 +176,28 @@ class KINDLE2(KINDLE):
|
||||
PRODUCT_ID = [0x0002, 0x0004]
|
||||
BCD = [0x0100]
|
||||
|
||||
EXTRA_CUSTOMIZATION_MESSAGE = [
|
||||
_('Send page number information when sending books') +
|
||||
':::' +
|
||||
_('The Kindle 3 and newer versions can use page number information '
|
||||
'in MOBI files. With this option, calibre will calculate and send'
|
||||
' this information to the Kindle when uploading MOBI files by'
|
||||
' USB. Note that the page numbers do not correspond to any paper'
|
||||
' book.'),
|
||||
_('Use slower but more accurate page number generation') +
|
||||
':::' +
|
||||
_('There are two ways to generate the page number information. Using the more accurate '
|
||||
'generator will produce pages that correspond better to a printed book. '
|
||||
'However, this method is slower and will slow down sending files '
|
||||
'to the Kindle.'),
|
||||
]
|
||||
EXTRA_CUSTOMIZATION_DEFAULT = [
|
||||
True,
|
||||
False,
|
||||
]
|
||||
OPT_APNX = 0
|
||||
OPT_APNX_ACCURATE = 1
|
||||
|
||||
def books(self, oncard=None, end_session=True):
|
||||
bl = USBMS.books(self, oncard=oncard, end_session=end_session)
|
||||
# Read collections information
|
||||
@ -212,13 +234,17 @@ class KINDLE2(KINDLE):
|
||||
'''
|
||||
Hijacking this function to write the apnx file.
|
||||
'''
|
||||
if not filepath.lower().endswith('.mobi'):
|
||||
opts = self.settings()
|
||||
if not opts.extra_customization[self.OPT_APNX]:
|
||||
return
|
||||
|
||||
if os.path.splitext(filepath.lower())[1] not in ('.azw', '.mobi', '.prc'):
|
||||
return
|
||||
|
||||
apnx_path = '%s.apnx' % os.path.join(path, filename)
|
||||
apnx_builder = APNXBuilder()
|
||||
try:
|
||||
apnx_builder.write_apnx(filepath, apnx_path)
|
||||
apnx_builder.write_apnx(filepath, apnx_path, accurate=opts.extra_customization[self.OPT_APNX_ACCURATE])
|
||||
except:
|
||||
print 'Failed to generate APNX'
|
||||
import traceback
|
||||
|
Loading…
x
Reference in New Issue
Block a user