mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Kindle driver: Add an option to turn off sending page number information. Also add an option to use a more accurate but slower algorithm to calculate page numbers
This commit is contained in:
commit
6ee8ffb9ff
@ -11,44 +11,42 @@ Generates and writes an APNX page mapping file.
|
|||||||
import struct
|
import struct
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
|
from calibre.ebooks.mobi.reader import MobiReader
|
||||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||||
|
from calibre.utils.logging import default_log
|
||||||
|
|
||||||
class APNXBuilder(object):
|
class APNXBuilder(object):
|
||||||
'''
|
'''
|
||||||
2300 characters of uncompressed text per page. This is
|
Create an APNX file using a pseudo page mapping.
|
||||||
not meant to map 1 to 1 to a print book but to be a
|
|
||||||
close enough measure.
|
|
||||||
|
|
||||||
A test book was chosen and the characters were counted
|
|
||||||
on one page. This number was round to 2240 then 60
|
|
||||||
characters of markup were added to the total giving
|
|
||||||
2300.
|
|
||||||
|
|
||||||
Uncompressed text length is used because it's easily
|
|
||||||
accessible in MOBI files (part of the header). Also,
|
|
||||||
It's faster to work off of the length then to
|
|
||||||
decompress and parse the actual text.
|
|
||||||
|
|
||||||
A better but much more resource intensive and slower
|
|
||||||
method to calculate the page length would be to parse
|
|
||||||
the uncompressed text. For each paragraph we would
|
|
||||||
want to find how many lines it would occupy in a paper
|
|
||||||
back book. 70 characters per line and 32 lines per page.
|
|
||||||
So divide the number of characters (minus markup) in
|
|
||||||
each paragraph by 70. If there are less than 70
|
|
||||||
characters in the paragraph then it is 1 line. Then,
|
|
||||||
count every 32 lines and mark that location as a page.
|
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def write_apnx(self, mobi_file_path, apnx_path):
|
def write_apnx(self, mobi_file_path, apnx_path, accurate=True):
|
||||||
|
# Check that this is really a MOBI file.
|
||||||
with open(mobi_file_path, 'rb') as mf:
|
with open(mobi_file_path, 'rb') as mf:
|
||||||
phead = PdbHeaderReader(mf)
|
ident = PdbHeaderReader(mf).identity()
|
||||||
r0 = phead.section_data(0)
|
if ident != 'BOOKMOBI':
|
||||||
text_length = struct.unpack('>I', r0[4:8])[0]
|
raise Exception(_('Not a valid MOBI file. Reports identity of %s' % ident))
|
||||||
|
|
||||||
pages = self.get_pages(text_length)
|
# Get the pages depending on the chosen parser
|
||||||
|
pages = []
|
||||||
|
if accurate:
|
||||||
|
try:
|
||||||
|
pages = self.get_pages_accurate(mobi_file_path)
|
||||||
|
except:
|
||||||
|
# Fall back to the fast parser if we can't
|
||||||
|
# use the accurate one. Typically this is
|
||||||
|
# due to the file having DRM.
|
||||||
|
pages = self.get_pages_fast(mobi_file_path)
|
||||||
|
else:
|
||||||
|
pages = self.get_pages_fast(mobi_file_path)
|
||||||
|
|
||||||
|
if not pages:
|
||||||
|
raise Exception(_('Could not generate page mapping.'))
|
||||||
|
|
||||||
|
# Generate the APNX file from the page mapping.
|
||||||
apnx = self.generate_apnx(pages)
|
apnx = self.generate_apnx(pages)
|
||||||
|
|
||||||
|
# Write the APNX.
|
||||||
with open(apnx_path, 'wb') as apnxf:
|
with open(apnx_path, 'wb') as apnxf:
|
||||||
apnxf.write(apnx)
|
apnxf.write(apnx)
|
||||||
|
|
||||||
@ -73,18 +71,126 @@ class APNXBuilder(object):
|
|||||||
apnx += struct.pack('>H', 32)
|
apnx += struct.pack('>H', 32)
|
||||||
apnx += page_header
|
apnx += page_header
|
||||||
|
|
||||||
# write page values to apnx
|
# Write page values to APNX.
|
||||||
for page in pages:
|
for page in pages:
|
||||||
apnx += struct.pack('>L', page)
|
apnx += struct.pack('>I', page)
|
||||||
|
|
||||||
return apnx
|
return apnx
|
||||||
|
|
||||||
def get_pages(self, text_length):
|
def get_pages_fast(self, mobi_file_path):
|
||||||
|
'''
|
||||||
|
2300 characters of uncompressed text per page. This is
|
||||||
|
not meant to map 1 to 1 to a print book but to be a
|
||||||
|
close enough measure.
|
||||||
|
|
||||||
|
A test book was chosen and the characters were counted
|
||||||
|
on one page. This number was rounded to 2240 then 60
|
||||||
|
characters of markup were added to the total giving
|
||||||
|
2300.
|
||||||
|
|
||||||
|
Uncompressed text length is used because it's easily
|
||||||
|
accessible in MOBI files (part of the header). Also,
|
||||||
|
It's faster to work off of the length than to
|
||||||
|
decompress and parse the actual text.
|
||||||
|
'''
|
||||||
|
text_length = 0
|
||||||
pages = []
|
pages = []
|
||||||
count = 0
|
count = 0
|
||||||
|
|
||||||
|
with open(mobi_file_path, 'rb') as mf:
|
||||||
|
phead = PdbHeaderReader(mf)
|
||||||
|
r0 = phead.section_data(0)
|
||||||
|
text_length = struct.unpack('>I', r0[4:8])[0]
|
||||||
|
|
||||||
while count < text_length:
|
while count < text_length:
|
||||||
pages.append(count)
|
pages.append(count)
|
||||||
count += 2300
|
count += 2300
|
||||||
|
|
||||||
return pages
|
return pages
|
||||||
|
|
||||||
|
def get_pages_accurate(self, mobi_file_path):
|
||||||
|
'''
|
||||||
|
A more accurate but much more resource intensive and slower
|
||||||
|
method to calculate the page length.
|
||||||
|
|
||||||
|
Parses the uncompressed text. In an average paper back book
|
||||||
|
There are 32 lines per page and a maximum of 70 characters
|
||||||
|
per line.
|
||||||
|
|
||||||
|
Each paragraph starts a new line and every 70 characters
|
||||||
|
(minus markup) in a paragraph starts a new line. The
|
||||||
|
position after every 32 lines will be marked as a new
|
||||||
|
page.
|
||||||
|
|
||||||
|
This can be made more accurate by accounting for
|
||||||
|
<div class="mbp_pagebreak" /> as a new page marker.
|
||||||
|
And <br> elements as an empty line.
|
||||||
|
'''
|
||||||
|
pages = []
|
||||||
|
|
||||||
|
# Get the MOBI html.
|
||||||
|
mr = MobiReader(mobi_file_path, default_log)
|
||||||
|
if mr.book_header.encryption_type != 0:
|
||||||
|
# DRMed book
|
||||||
|
return self.get_pages_fast(mobi_file_path)
|
||||||
|
mr.extract_text()
|
||||||
|
|
||||||
|
# States
|
||||||
|
in_tag = False
|
||||||
|
in_p = False
|
||||||
|
check_p = False
|
||||||
|
closing = False
|
||||||
|
p_char_count = 0
|
||||||
|
|
||||||
|
# Get positions of every line
|
||||||
|
# A line is either a paragraph starting
|
||||||
|
# or every 70 characters in a paragraph.
|
||||||
|
lines = []
|
||||||
|
pos = -1
|
||||||
|
# We want this to be as fast as possible so we
|
||||||
|
# are going to do one pass across the text. re
|
||||||
|
# and string functions will parse the text each
|
||||||
|
# time they are called.
|
||||||
|
#
|
||||||
|
# We can use .lower() here because we are
|
||||||
|
# not modifying the text. In this case the case
|
||||||
|
# doesn't matter just the absolute character and
|
||||||
|
# the position within the stream.
|
||||||
|
for c in mr.mobi_html.lower():
|
||||||
|
pos += 1
|
||||||
|
|
||||||
|
# Check if we are starting or stopping a p tag.
|
||||||
|
if check_p:
|
||||||
|
if c == '/':
|
||||||
|
closing = True
|
||||||
|
continue
|
||||||
|
elif c == 'p':
|
||||||
|
if closing:
|
||||||
|
in_p = False
|
||||||
|
else:
|
||||||
|
in_p = True
|
||||||
|
lines.append(pos - 2)
|
||||||
|
check_p = False
|
||||||
|
closing = False
|
||||||
|
continue
|
||||||
|
|
||||||
|
if c == '<':
|
||||||
|
in_tag = True
|
||||||
|
check_p = True
|
||||||
|
continue
|
||||||
|
elif c == '>':
|
||||||
|
in_tag = False
|
||||||
|
check_p = False
|
||||||
|
continue
|
||||||
|
|
||||||
|
if in_p and not in_tag:
|
||||||
|
p_char_count += 1
|
||||||
|
if p_char_count == 70:
|
||||||
|
lines.append(pos)
|
||||||
|
p_char_count = 0
|
||||||
|
|
||||||
|
# Every 32 lines is a new page
|
||||||
|
for i in xrange(0, len(lines), 32):
|
||||||
|
pages.append(lines[i])
|
||||||
|
|
||||||
|
return pages
|
||||||
|
@ -176,6 +176,28 @@ class KINDLE2(KINDLE):
|
|||||||
PRODUCT_ID = [0x0002, 0x0004]
|
PRODUCT_ID = [0x0002, 0x0004]
|
||||||
BCD = [0x0100]
|
BCD = [0x0100]
|
||||||
|
|
||||||
|
EXTRA_CUSTOMIZATION_MESSAGE = [
|
||||||
|
_('Send page number information when sending books') +
|
||||||
|
':::' +
|
||||||
|
_('The Kindle 3 and newer versions can use page number information '
|
||||||
|
'in MOBI files. With this option, calibre will calculate and send'
|
||||||
|
' this information to the Kindle when uploading MOBI files by'
|
||||||
|
' USB. Note that the page numbers do not correspond to any paper'
|
||||||
|
' book.'),
|
||||||
|
_('Use slower but more accurate page number generation') +
|
||||||
|
':::' +
|
||||||
|
_('There are two ways to generate the page number information. Using the more accurate '
|
||||||
|
'generator will produce pages that correspond better to a printed book. '
|
||||||
|
'However, this method is slower and will slow down sending files '
|
||||||
|
'to the Kindle.'),
|
||||||
|
]
|
||||||
|
EXTRA_CUSTOMIZATION_DEFAULT = [
|
||||||
|
True,
|
||||||
|
False,
|
||||||
|
]
|
||||||
|
OPT_APNX = 0
|
||||||
|
OPT_APNX_ACCURATE = 1
|
||||||
|
|
||||||
def books(self, oncard=None, end_session=True):
|
def books(self, oncard=None, end_session=True):
|
||||||
bl = USBMS.books(self, oncard=oncard, end_session=end_session)
|
bl = USBMS.books(self, oncard=oncard, end_session=end_session)
|
||||||
# Read collections information
|
# Read collections information
|
||||||
@ -212,13 +234,17 @@ class KINDLE2(KINDLE):
|
|||||||
'''
|
'''
|
||||||
Hijacking this function to write the apnx file.
|
Hijacking this function to write the apnx file.
|
||||||
'''
|
'''
|
||||||
if not filepath.lower().endswith('.mobi'):
|
opts = self.settings()
|
||||||
|
if not opts.extra_customization[self.OPT_APNX]:
|
||||||
|
return
|
||||||
|
|
||||||
|
if os.path.splitext(filepath.lower())[1] not in ('.azw', '.mobi', '.prc'):
|
||||||
return
|
return
|
||||||
|
|
||||||
apnx_path = '%s.apnx' % os.path.join(path, filename)
|
apnx_path = '%s.apnx' % os.path.join(path, filename)
|
||||||
apnx_builder = APNXBuilder()
|
apnx_builder = APNXBuilder()
|
||||||
try:
|
try:
|
||||||
apnx_builder.write_apnx(filepath, apnx_path)
|
apnx_builder.write_apnx(filepath, apnx_path, accurate=opts.extra_customization[self.OPT_APNX_ACCURATE])
|
||||||
except:
|
except:
|
||||||
print 'Failed to generate APNX'
|
print 'Failed to generate APNX'
|
||||||
import traceback
|
import traceback
|
||||||
|
Loading…
x
Reference in New Issue
Block a user