mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Added a way to create custom-labeled page breaks in an APNX file using ARIA (Accessible Rich Internet Applications) attributes in HTML.
Any element of the form <any_html_element *** role="doc-pagebreak" *** aria-label="some_label" ***> will produce a page break location. Possible usage: <span id="pg159" role="doc-pagebreak" aria-label="159"/> <h1 id="pg_header1" role="doc-pagebreak" aria-label="Header 1">Header</h1> <p role="doc-pagebreak" aria-label="§ 1 part 4 page 6 of 9">Text</p>
This commit is contained in:
parent
8f0226d8b1
commit
c6bcce78b2
@ -1,4 +1,4 @@
|
||||
__license__ = 'GPL v3'
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, John Schember <john at nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
@ -6,10 +6,8 @@ __docformat__ = 'restructuredtext en'
|
||||
Generates and writes an APNX page mapping file.
|
||||
'''
|
||||
|
||||
import re
|
||||
import struct
|
||||
|
||||
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
|
||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||
from calibre.ebooks.mobi.reader.headers import MetadataHeader
|
||||
from calibre.utils.logging import default_log
|
||||
@ -17,29 +15,69 @@ from calibre import prints, fsync
|
||||
from calibre.constants import DEBUG
|
||||
from polyglot.builtins import as_unicode, as_bytes
|
||||
|
||||
from calibre.devices.kindle.apnx_page_generator.generators.accurate_page_generator import AccuratePageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.generators.pagebreak_page_generator import PagebreakPageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.generators.aria_pagebreak_page_generator import \
|
||||
AriaPagebreakPageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.generators.exact_page_generator import ExactPageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.generators.fast_page_generator import FastPageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.i_page_generator import IPageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.pages import Pages
|
||||
|
||||
|
||||
class APNXBuilder:
|
||||
'''
|
||||
"""
|
||||
Create an APNX file using a pseudo page mapping.
|
||||
'''
|
||||
"""
|
||||
|
||||
def write_apnx(self, mobi_file_path, apnx_path, method=None, page_count=0):
|
||||
'''
|
||||
generators: dict[str, IPageGenerator] = {
|
||||
FastPageGenerator.instance.name(): FastPageGenerator.instance,
|
||||
AccuratePageGenerator.instance.name(): AccuratePageGenerator.instance,
|
||||
PagebreakPageGenerator.instance.name(): PagebreakPageGenerator.instance,
|
||||
AriaPagebreakPageGenerator.instance.name(): AriaPagebreakPageGenerator.instance,
|
||||
# ExactPageGenerator.instance.name(): ExactPageGenerator.instance,
|
||||
}
|
||||
|
||||
def write_apnx(self, mobi_file_path: str, apnx_path: str, method: str | None = None, page_count: int = 0):
|
||||
"""
|
||||
If you want a fixed number of pages (such as from a custom column) then
|
||||
pass in a value to page_count, otherwise a count will be estimated
|
||||
using either the fast or accurate algorithm.
|
||||
'''
|
||||
import uuid
|
||||
apnx_meta = {'guid': str(uuid.uuid4()).replace('-', '')[:8], 'asin':
|
||||
'', 'cdetype': 'EBOK', 'format': 'MOBI_7', 'acr': ''}
|
||||
"""
|
||||
apnx_meta = self.get_apnx_meta(mobi_file_path)
|
||||
|
||||
if page_count:
|
||||
generator: IPageGenerator = ExactPageGenerator.instance
|
||||
else:
|
||||
generator: IPageGenerator = self.generators.setdefault(method, FastPageGenerator.instance)
|
||||
|
||||
pages = generator.generate(mobi_file_path, page_count)
|
||||
if pages.number_of_pages == 0:
|
||||
raise Exception(_('Could not generate page mapping.'))
|
||||
# Generate the APNX file from the page mapping.
|
||||
apnx = self.generate_apnx(pages, apnx_meta)
|
||||
|
||||
# Write the APNX.
|
||||
with lopen(apnx_path, 'wb') as apnxf:
|
||||
apnxf.write(apnx)
|
||||
fsync(apnxf)
|
||||
|
||||
@staticmethod
|
||||
def get_apnx_meta(mobi_file_path) -> dict[str, str]:
|
||||
import uuid
|
||||
apnx_meta = {
|
||||
'guid': str(uuid.uuid4()).replace('-', '')[:8],
|
||||
'asin': '',
|
||||
'cdetype': 'EBOK',
|
||||
'format': 'MOBI_7',
|
||||
'acr': ''
|
||||
}
|
||||
with lopen(mobi_file_path, 'rb') as mf:
|
||||
ident = PdbHeaderReader(mf).identity()
|
||||
if as_bytes(ident) != b'BOOKMOBI':
|
||||
# Check that this is really a MOBI file.
|
||||
raise Exception(_('Not a valid MOBI file. Reports identity of %s') % ident)
|
||||
apnx_meta['acr'] = as_unicode(PdbHeaderReader(mf).name(), errors='replace')
|
||||
|
||||
# We'll need the PDB name, the MOBI version, and some metadata to make FW 3.4 happy with KF8 files...
|
||||
with lopen(mobi_file_path, 'rb') as mf:
|
||||
mh = MetadataHeader(mf, default_log)
|
||||
@ -55,41 +93,10 @@ class APNXBuilder:
|
||||
apnx_meta['asin'] = ''
|
||||
else:
|
||||
apnx_meta['asin'] = str(mh.exth.uuid)
|
||||
return apnx_meta
|
||||
|
||||
# Get the pages depending on the chosen parser
|
||||
pages = []
|
||||
if page_count:
|
||||
pages = self.get_pages_exact(mobi_file_path, page_count)
|
||||
else:
|
||||
try:
|
||||
if method == 'accurate':
|
||||
pages = self.get_pages_accurate(mobi_file_path)
|
||||
elif method == 'pagebreak':
|
||||
pages = self.get_pages_pagebreak_tag(mobi_file_path)
|
||||
if not pages:
|
||||
pages = self.get_pages_accurate(mobi_file_path)
|
||||
else:
|
||||
raise Exception('%r is not a valid apnx generation method' % method)
|
||||
except:
|
||||
# Fall back to the fast parser if we can't
|
||||
# use the accurate one. Typically this is
|
||||
# due to the file having DRM.
|
||||
pages = self.get_pages_fast(mobi_file_path)
|
||||
|
||||
if not pages:
|
||||
pages = self.get_pages_fast(mobi_file_path)
|
||||
if not pages:
|
||||
raise Exception(_('Could not generate page mapping.'))
|
||||
|
||||
# Generate the APNX file from the page mapping.
|
||||
apnx = self.generate_apnx(pages, apnx_meta)
|
||||
|
||||
# Write the APNX.
|
||||
with lopen(apnx_path, 'wb') as apnxf:
|
||||
apnxf.write(apnx)
|
||||
fsync(apnxf)
|
||||
|
||||
def generate_apnx(self, pages, apnx_meta):
|
||||
@staticmethod
|
||||
def generate_apnx(pages: Pages, apnx_meta) -> bytes:
|
||||
apnx = b''
|
||||
|
||||
if DEBUG:
|
||||
@ -107,8 +114,8 @@ class APNXBuilder:
|
||||
# legacy mobi files, too. But, since they still handle this one too, let's
|
||||
# try not to break old devices, and keep using the simple header ;).
|
||||
content_header = '{"contentGuid":"%(guid)s","asin":"%(asin)s","cdeType":"%(cdetype)s","fileRevisionId":"1"}' % apnx_meta
|
||||
page_header = '{"asin":"%(asin)s","pageMap":"(1,a,1)"}' % apnx_meta
|
||||
|
||||
page_header = '{"asin":"%(asin)s","pageMap":"' % apnx_meta
|
||||
page_header += pages.page_maps + '"}'
|
||||
if DEBUG:
|
||||
prints('APNX Content Header:', content_header)
|
||||
content_header = as_bytes(content_header)
|
||||
@ -120,177 +127,12 @@ class APNXBuilder:
|
||||
apnx += content_header
|
||||
apnx += struct.pack('>H', 1)
|
||||
apnx += struct.pack('>H', len(page_header))
|
||||
apnx += struct.pack('>H', len(pages))
|
||||
apnx += struct.pack('>H', pages.number_of_pages)
|
||||
apnx += struct.pack('>H', 32)
|
||||
apnx += page_header
|
||||
|
||||
# Write page values to APNX.
|
||||
for page in pages:
|
||||
apnx += struct.pack('>I', page)
|
||||
for location in pages.page_locations:
|
||||
apnx += struct.pack('>I', location)
|
||||
|
||||
return apnx
|
||||
|
||||
def get_pages_exact(self, mobi_file_path, page_count):
|
||||
'''
|
||||
Given a specified page count (such as from a custom column),
|
||||
create our array of pages for the apnx file by dividing by
|
||||
the content size of the book.
|
||||
'''
|
||||
pages = []
|
||||
count = 0
|
||||
|
||||
with lopen(mobi_file_path, 'rb') as mf:
|
||||
phead = PdbHeaderReader(mf)
|
||||
r0 = phead.section_data(0)
|
||||
text_length = struct.unpack('>I', r0[4:8])[0]
|
||||
|
||||
chars_per_page = int(text_length // page_count)
|
||||
while count < text_length:
|
||||
pages.append(count)
|
||||
count += chars_per_page
|
||||
|
||||
if len(pages) > page_count:
|
||||
# Rounding created extra page entries
|
||||
pages = pages[:page_count]
|
||||
|
||||
return pages
|
||||
|
||||
def get_pages_fast(self, mobi_file_path):
|
||||
'''
|
||||
2300 characters of uncompressed text per page. This is
|
||||
not meant to map 1 to 1 to a print book but to be a
|
||||
close enough measure.
|
||||
|
||||
A test book was chosen and the characters were counted
|
||||
on one page. This number was round to 2240 then 60
|
||||
characters of markup were added to the total giving
|
||||
2300.
|
||||
|
||||
Uncompressed text length is used because it's easily
|
||||
accessible in MOBI files (part of the header). Also,
|
||||
It's faster to work off of the length then to
|
||||
decompress and parse the actual text.
|
||||
'''
|
||||
text_length = 0
|
||||
pages = []
|
||||
count = 0
|
||||
|
||||
with lopen(mobi_file_path, 'rb') as mf:
|
||||
phead = PdbHeaderReader(mf)
|
||||
r0 = phead.section_data(0)
|
||||
text_length = struct.unpack('>I', r0[4:8])[0]
|
||||
|
||||
while count < text_length:
|
||||
pages.append(count)
|
||||
count += 2300
|
||||
|
||||
return pages
|
||||
|
||||
def get_pages_accurate(self, mobi_file_path):
|
||||
'''
|
||||
A more accurate but much more resource intensive and slower
|
||||
method to calculate the page length.
|
||||
|
||||
Parses the uncompressed text. In an average paper back book
|
||||
There are 32 lines per page and a maximum of 70 characters
|
||||
per line.
|
||||
|
||||
Each paragraph starts a new line and every 70 characters
|
||||
(minus markup) in a paragraph starts a new line. The
|
||||
position after every 30 lines will be marked as a new
|
||||
page.
|
||||
|
||||
This can be make more accurate by accounting for
|
||||
<div class="mbp_pagebreak" /> as a new page marker.
|
||||
And <br> elements as an empty line.
|
||||
'''
|
||||
pages = []
|
||||
|
||||
# Get the MOBI html.
|
||||
mr = MobiReader(mobi_file_path, default_log)
|
||||
if mr.book_header.encryption_type != 0:
|
||||
# DRMed book
|
||||
return self.get_pages_fast(mobi_file_path)
|
||||
mr.extract_text()
|
||||
|
||||
# States
|
||||
in_tag = False
|
||||
in_p = False
|
||||
check_p = False
|
||||
closing = False
|
||||
p_char_count = 0
|
||||
|
||||
# Get positions of every line
|
||||
# A line is either a paragraph starting
|
||||
# or every 70 characters in a paragraph.
|
||||
lines = []
|
||||
pos = -1
|
||||
# We want this to be as fast as possible so we
|
||||
# are going to do one pass across the text. re
|
||||
# and string functions will parse the text each
|
||||
# time they are called.
|
||||
#
|
||||
# We can can use .lower() here because we are
|
||||
# not modifying the text. In this case the case
|
||||
# doesn't matter just the absolute character and
|
||||
# the position within the stream.
|
||||
data = bytearray(as_bytes(mr.mobi_html.lower()))
|
||||
slash, p, lt, gt = map(ord, '/p<>')
|
||||
for c in data:
|
||||
pos += 1
|
||||
|
||||
# Check if we are starting or stopping a p tag.
|
||||
if check_p:
|
||||
if c == slash:
|
||||
closing = True
|
||||
continue
|
||||
elif c == p:
|
||||
if closing:
|
||||
in_p = False
|
||||
else:
|
||||
in_p = True
|
||||
lines.append(pos - 2)
|
||||
check_p = False
|
||||
closing = False
|
||||
continue
|
||||
|
||||
if c == lt:
|
||||
in_tag = True
|
||||
check_p = True
|
||||
continue
|
||||
elif c == gt:
|
||||
in_tag = False
|
||||
check_p = False
|
||||
continue
|
||||
|
||||
if in_p and not in_tag:
|
||||
p_char_count += 1
|
||||
if p_char_count == 70:
|
||||
lines.append(pos)
|
||||
p_char_count = 0
|
||||
|
||||
# Every 30 lines is a new page
|
||||
for i in range(0, len(lines), 32):
|
||||
pages.append(lines[i])
|
||||
|
||||
return pages
|
||||
|
||||
def get_pages_pagebreak_tag(self, mobi_file_path):
|
||||
'''
|
||||
Determine pages based on the presence of
|
||||
<mbp:pagebreak>.
|
||||
'''
|
||||
pages = []
|
||||
|
||||
# Get the MOBI html.
|
||||
mr = MobiReader(mobi_file_path, default_log)
|
||||
if mr.book_header.encryption_type != 0:
|
||||
# DRMed book
|
||||
return self.get_pages_fast(mobi_file_path)
|
||||
mr.extract_text()
|
||||
|
||||
html = as_bytes(mr.mobi_html.lower())
|
||||
for m in re.finditer(b'<[^>]*pagebreak[^>]*>', html):
|
||||
pages.append(m.end())
|
||||
|
||||
return pages
|
||||
|
@ -0,0 +1,103 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2022, Vaso Peras-Likodric <vaso at vipl.in.rs>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.devices.kindle.apnx_page_generator.generators.fast_page_generator import FastPageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.i_page_generator import IPageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.pages import Pages
|
||||
|
||||
|
||||
class AccuratePageGenerator(IPageGenerator):
    """Page generator that estimates page starts by counting text lines in the book's HTML."""

    def name(self) -> str:
        # Key under which this generator is selected.
        return "accurate"

    def _generate_fallback(self, mobi_file_path: str, real_count: int | None) -> Pages:
        # Delegate to the fast generator when the accurate pass fails or
        # produces no pages.
        return FastPageGenerator.instance.generate(mobi_file_path, real_count)

    def _generate(self, mobi_file_path: str, real_count: int | None) -> Pages:
        """
        A more accurate but much more resource intensive and slower
        method to calculate the page length.

        Parses the uncompressed text. In an average paperback book
        there are 32 lines per page and a maximum of 70 characters
        per line.

        Each paragraph starts a new line and every 70 characters
        (minus markup) in a paragraph starts a new line. The
        position of every 32nd line is marked as a new page.

        This could be made more accurate by accounting for
        <div class="mbp_pagebreak" /> as a new page marker,
        and <br> elements as an empty line.

        ``real_count`` is not used by this generator.
        """
        pages = []

        html = self.mobi_html(mobi_file_path)

        # States
        in_tag = False      # between '<' and '>'
        in_p = False        # inside an open paragraph
        check_p = False     # the previous byte(s) were '<' or '</'
        closing = False     # a '/' directly followed the '<'
        p_char_count = 0    # text bytes seen since the last 70-byte line break

        # Get positions of every line
        # A line is either a paragraph starting
        # or every 70 characters in a paragraph.
        lines = []
        pos = -1
        # We want this to be as fast as possible so we
        # are going to do one pass across the text. re
        # and string functions will parse the text each
        # time they are called.
        #
        # mobi_html() returns the text already lower-cased. That is
        # fine here because the text is not modified: only the
        # characters '/', 'p', '<', '>' and the byte positions
        # within the stream matter.
        data = bytearray(html)
        slash, p, lt, gt = map(ord, '/p<>')
        for c in data:
            pos += 1

            # Check if we are starting or stopping a p tag.
            # NOTE(review): only the first byte of the tag name is examined,
            # so any tag starting with 'p' is treated like <p>.
            if check_p:
                if c == slash:
                    closing = True
                    continue
                elif c == p:
                    if closing:
                        in_p = False
                    else:
                        in_p = True
                        # pos is the index of the 'p'; pos - 2 is the byte
                        # just before the opening '<'.
                        lines.append(pos - 2)
                check_p = False
                closing = False
                continue

            if c == lt:
                in_tag = True
                check_p = True
                continue
            elif c == gt:
                in_tag = False
                check_p = False
                continue

            if in_p and not in_tag:
                # NOTE(review): p_char_count is only reset when it reaches 70,
                # not at the start of a paragraph, so the count carries over
                # from one paragraph to the next.
                p_char_count += 1
                if p_char_count == 70:
                    lines.append(pos)
                    p_char_count = 0

        # Every 32 lines is a new page
        for i in range(0, len(lines), 32):
            pages.append(lines[i])

        return Pages(pages)


# Shared instance used by the other modules.
AccuratePageGenerator.instance = AccuratePageGenerator()
|
@ -0,0 +1,84 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2022, Vaso Peras-Likodric <vaso at vipl.in.rs>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.devices.kindle.apnx_page_generator.generators.fast_page_generator import FastPageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.i_page_generator import IPageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.page_number_type import PageNumberTypes
|
||||
from calibre.devices.kindle.apnx_page_generator.pages import Pages
|
||||
from calibre.devices.kindle.apnx_page_generator.page_group import PageGroup
|
||||
import re
|
||||
|
||||
# Lower-case Roman symbols paired with their integer values. The order lets
# from_roman() consume a validated numeral in a single left-to-right pass.
roman_numeral_map = (('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40),
                     ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1))

# Validates a whole lower-case Roman numeral: thousands, hundreds, tens, units.
# The units group must use a lower-case 'v' like every other symbol; an
# upper-case 'V' here rejected 'v', 'vi', 'vii' and 'viii' and let 'V' through.
roman_numeral_pattern = re.compile(r'^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$')


def from_roman(s: str) -> int:
    """Convert a lower-case Roman numeral to an integer.

    :param s: the numeral, e.g. ``'xiv'``.
    :return: the integer value of ``s``.
    :raises ValueError: if ``s`` is empty or is not a well-formed
        lower-case Roman numeral.
    """
    if not s:
        raise ValueError('Input can not be blank')
    if not roman_numeral_pattern.match(s):
        raise ValueError('Invalid Roman numeral: %s' % s)

    result = 0
    index = 0
    for numeral, integer in roman_numeral_map:
        # Consume as many repetitions of this symbol as appear at the cursor.
        while s.startswith(numeral, index):
            result += integer
            index += len(numeral)
    return result
|
||||
|
||||
|
||||
class LabelDescriptor:
    """Parsed form of one page label: its text, numeric value and numbering type."""

    def __init__(self, label: str, value: int, label_type: PageNumberTypes):
        # label: the label text as found in the markup
        # value: the numeric value of the label
        # label_type: the numbering scheme the label belongs to
        self.label, self.value, self.label_type = label, value, label_type
|
||||
|
||||
|
||||
class AriaPagebreakPageGenerator(IPageGenerator):
    """Build the page map from elements marked ``role="doc-pagebreak"``.

    Each such element that also has an ``aria-label`` attribute becomes one
    page; the label decides whether the page is numbered in Arabic, Roman or
    custom style.
    """

    # Matches a tag that has both role="doc-pagebreak" and an aria-label, in
    # either attribute order (HTML attribute order is not significant). The
    # label is captured in group 1 (role first) or group 2 (label first).
    # '|' is excluded from labels, presumably because PageGroup joins custom
    # labels with '|' when it builds the page map.
    _PAGEBREAK_RE = re.compile(
        rb'<[^>]*role="doc-pagebreak"[^>]*aria-label="([^"|]+)"[^>]*>'
        rb'|<[^>]*aria-label="([^"|]+)"[^>]*role="doc-pagebreak"[^>]*>'
    )

    def name(self) -> str:
        # Key under which this generator is selected.
        return "aria_pagebreak"

    def _generate_fallback(self, mobi_file_path: str, real_count: int | None) -> Pages:
        # Delegate to the fast generator when no labelled page breaks were
        # found or parsing failed.
        return FastPageGenerator.instance.generate(mobi_file_path, real_count)

    def _generate(self, mobi_file_path: str, real_count: int | None) -> Pages:
        """Collect page locations and labels from the book's HTML.

        Consecutive labels of the same type are merged into one group:
        Arabic and Roman labels only when the value continues the previous
        one by exactly 1, custom labels always. Anything else starts a new
        group. ``real_count`` is not used.

        :param mobi_file_path: path of the MOBI file to scan.
        :return: the grouped pages; empty when no page break was found.
        """
        html = self.mobi_html(mobi_file_path)
        pages = Pages()

        for match in self._PAGEBREAK_RE.finditer(html):
            descriptor = self.get_label(match.group(1) or match.group(2))
            # The page starts right after the closing '>' of the tag.
            location = match.end()
            is_custom = descriptor.label_type == PageNumberTypes.Custom

            if pages.number_of_pages > 0:
                group = pages.last_group
                same_type = group.page_number_types == descriptor.label_type
                continues = is_custom or group.last_value == descriptor.value - 1
                if same_type and continues:
                    # Custom groups store the label with each location.
                    group.append((location, descriptor.label) if is_custom else location)
                    continue

            pages.append(PageGroup(location, descriptor.label_type, descriptor.value, descriptor.label))

        return pages

    @staticmethod
    def get_label(label: bytes) -> LabelDescriptor:
        """Classify a raw label as Arabic, Roman or custom.

        :param label: the undecoded ``aria-label`` value.
        :return: a descriptor holding the decoded text, its numeric value
            (0 for custom labels) and the detected numbering type.
        """
        label_string = label.decode()
        try:
            return LabelDescriptor(label_string, int(label_string), PageNumberTypes.Arabic)
        except ValueError:
            pass
        try:
            return LabelDescriptor(label_string, from_roman(label_string), PageNumberTypes.Roman)
        except ValueError:
            return LabelDescriptor(label_string, 0, PageNumberTypes.Custom)


# Shared instance used by the other modules.
AriaPagebreakPageGenerator.instance = AriaPagebreakPageGenerator()
|
@ -0,0 +1,41 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2022, Vaso Peras-Likodric <vaso at vipl.in.rs>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.devices.kindle.apnx_page_generator.generators.fast_page_generator import FastPageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.i_page_generator import IPageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.pages import Pages
|
||||
|
||||
|
||||
class ExactPageGenerator(IPageGenerator):
    """Page generator that spreads a known page count evenly over the text length."""

    def name(self) -> str:
        # Key under which this generator is selected.
        return "exact"

    def _generate_fallback(self, mobi_file_path: str, real_count: int | None) -> Pages:
        # Delegate to the fast generator when no usable page count is given
        # or no pages could be produced.
        return FastPageGenerator.instance.generate(mobi_file_path, real_count)

    def _generate(self, mobi_file_path: str, real_count: int | None) -> Pages:
        """
        Given a specified page count (such as from a custom column),
        create our array of pages for the apnx file by dividing by
        the content size of the book.

        :param mobi_file_path: path of the MOBI file.
        :param real_count: the number of pages wanted; must be positive.
        :return: at most ``real_count`` evenly spaced page locations.
        :raises ValueError: if ``real_count`` is missing or not positive.
        """
        if real_count is None or real_count <= 0:
            raise ValueError('A positive page count is required, got %r' % (real_count,))

        text_length = self.mobi_html_length(mobi_file_path)

        # Never step by less than one character: a step of 0 (page count
        # larger than the text length) would make the walk below endless.
        chars_per_page = max(1, int(text_length // real_count))

        # Integer division can leave a remainder that yields extra entries,
        # so keep only the first real_count locations.
        return Pages(list(range(0, text_length, chars_per_page)[:real_count]))


# Shared instance used by the other modules.
ExactPageGenerator.instance = ExactPageGenerator()
|
@ -0,0 +1,46 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2022, Vaso Peras-Likodric <vaso at vipl.in.rs>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.devices.kindle.apnx_page_generator.i_page_generator import IPageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.pages import Pages
|
||||
|
||||
|
||||
class FastPageGenerator(IPageGenerator):
    """Page generator that places a page every 2300 characters of text."""

    def name(self) -> str:
        # Key under which this generator is selected.
        return "fast"

    def _generate_fallback(self, mobi_file_path: str, real_count: int | None) -> Pages:
        # This generator is the fallback of the others; it has none itself.
        raise Exception("Fast calculation impossible.")

    def _generate(self, mobi_file_path: str, real_count: int | None) -> Pages:
        """
        Assume 2300 characters of uncompressed text per page. The result
        is not meant to match a print edition page for page, only to be
        a close enough measure.

        The figure comes from counting the characters on one page of a
        test book, rounding that to 2240 and adding 60 characters of
        markup, which gives 2300.

        The uncompressed text length is used because it is stored in
        the MOBI header and so is cheap to read; working from the length
        is faster than decompressing and parsing the actual text.

        ``real_count`` is not used by this generator.
        """
        total_length = self.mobi_html_length(mobi_file_path)

        # One page start at every multiple of 2300 below the text length.
        return Pages(list(range(0, total_length, 2300)))


# Shared instance used by the other modules.
FastPageGenerator.instance = FastPageGenerator()
|
@ -0,0 +1,29 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2022, Vaso Peras-Likodric <vaso at vipl.in.rs>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.devices.kindle.apnx_page_generator.generators.fast_page_generator import FastPageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.i_page_generator import IPageGenerator
|
||||
from calibre.devices.kindle.apnx_page_generator.pages import Pages
|
||||
import re
|
||||
|
||||
|
||||
class PagebreakPageGenerator(IPageGenerator):
    """Page generator that starts a page after every tag containing ``pagebreak``."""

    def name(self) -> str:
        # Key under which this generator is selected.
        return "pagebreak"

    def _generate_fallback(self, mobi_file_path: str, real_count: int | None) -> Pages:
        # Delegate to the fast generator when no page break tag was found
        # or parsing failed.
        return FastPageGenerator.instance.generate(mobi_file_path, real_count)

    def _generate(self, mobi_file_path: str, real_count: int | None) -> Pages:
        """Return one page per tag whose text contains ``pagebreak``.

        The page location is the offset just past the closing ``>`` of the
        tag. ``real_count`` is not used by this generator.
        """
        markup = self.mobi_html(mobi_file_path)
        break_ends = [found.end() for found in re.finditer(b'<[^>]*pagebreak[^>]*>', markup)]
        return Pages(break_ends)


# Shared instance used by the other modules.
PagebreakPageGenerator.instance = PagebreakPageGenerator()
|
@ -0,0 +1,53 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2022, Vaso Peras-Likodric <vaso at vipl.in.rs>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import struct
|
||||
from abc import abstractmethod, ABCMeta
|
||||
from calibre.devices.kindle.apnx_page_generator.pages import Pages
|
||||
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
|
||||
from calibre.utils.logging import default_log
|
||||
from polyglot.builtins import as_bytes
|
||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||
|
||||
|
||||
class IPageGenerator(metaclass=ABCMeta):
    """Base class of the page generators.

    Subclasses implement ``_generate`` (the real work), ``_generate_fallback``
    (what to do when that fails or yields nothing) and ``name``.
    """

    @abstractmethod
    def _generate(self, mobi_file_path: str, real_count: int | None) -> Pages:
        """Produce the pages for the given MOBI file."""
        pass

    @abstractmethod
    def _generate_fallback(self, mobi_file_path: str, real_count: int | None) -> Pages:
        """Produce pages when ``_generate`` failed or returned no pages."""
        pass

    def generate(self, mobi_file_path: str, real_count: int | None) -> Pages:
        """Run ``_generate`` and fall back when it fails or finds no pages.

        :param mobi_file_path: path of the MOBI file.
        :param real_count: a known page count, passed through to the generator.
        :return: the generated pages, or the result of the fallback.
        :raises Exception: whatever ``FastPageGenerator._generate`` raises,
            or whatever the fallback raises.
        """
        try:
            result = self._generate(mobi_file_path, real_count)
            has_pages = result.number_of_pages > 0
        except Exception:
            # The fast generator is the end of the fallback chain, so its own
            # failure is reported to the caller. The check is done by class
            # name, which avoids importing FastPageGenerator here.
            if self.__class__.__name__ == "FastPageGenerator":
                raise
            has_pages = False

        if has_pages:
            return result
        # Called outside the try block so that a failing fallback is run
        # once and its exception goes straight to the caller.
        return self._generate_fallback(mobi_file_path, real_count)

    @abstractmethod
    def name(self) -> str:
        """Return the key under which this generator is selected."""
        pass

    @staticmethod
    def mobi_html(mobi_file_path: str) -> bytes:
        """Return the lower-cased HTML of the book as bytes.

        :raises Exception: if the book header reports a non-zero encryption
            type (a DRMed book).
        """
        mr = MobiReader(mobi_file_path, default_log)
        if mr.book_header.encryption_type != 0:
            raise Exception("DRMed book")
        mr.extract_text()
        return as_bytes(mr.mobi_html.lower())

    @staticmethod
    def mobi_html_length(mobi_file_path: str) -> int:
        """Return the text length recorded in the file's first PDB section.

        The value is the big-endian unsigned 32-bit integer at bytes 4..8 of
        section 0.
        """
        # NOTE(review): lopen is neither defined nor imported in the visible
        # code; presumably it is provided by calibre as a builtin -- confirm.
        with lopen(mobi_file_path, 'rb') as mf:
            pdb_header = PdbHeaderReader(mf)
            r0 = pdb_header.section_data(0)
            return struct.unpack('>I', r0[4:8])[0]
|
||||
|
55
src/calibre/devices/kindle/apnx_page_generator/page_group.py
Normal file
55
src/calibre/devices/kindle/apnx_page_generator/page_group.py
Normal file
@ -0,0 +1,55 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2022, Vaso Peras-Likodric <vaso at vipl.in.rs>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.devices.kindle.apnx_page_generator.page_number_type import PageNumberTypes
|
||||
|
||||
|
||||
class PageGroup:
    """A run of consecutive pages that share one numbering type.

    Arabic and Roman groups are described by their first value only; custom
    groups keep one label per page.
    """

    def __init__(self, page_locations: int | list[int], page_number_type: PageNumberTypes, first_value: int,
                 page_labels: str | list[str] | None = None):
        """Simulate constructor overloading.

        :param page_locations: a single location or a list of locations.
        :param page_number_type: the numbering type of every page in the group.
        :param first_value: the numeric value of the first page.
        :param page_labels: for custom groups, a single label (with a single
            location) or one non-empty label per location; ignored otherwise.
        """
        if isinstance(page_locations, int):
            self.page_locations: list[int] = [page_locations]
        else:
            self.page_locations: list[int] = page_locations
        self.__page_number_type: PageNumberTypes = page_number_type
        self.__first_value = first_value
        # Always defined so the attribute exists for every numbering type.
        self.__page_number_labels: list[str] = []
        if page_number_type == PageNumberTypes.Custom:
            assert page_labels is not None
            if isinstance(page_labels, str):
                assert 1 == len(self.page_locations) and len(page_labels) > 0
                self.__page_number_labels = [page_labels]
            else:
                assert len(page_labels) == len(self.page_locations)
                assert all(len(label) > 0 for label in page_labels)
                self.__page_number_labels = page_labels

    def append(self, page_location: int | tuple[int, str]) -> None:
        """Add one page to the group.

        :param page_location: a bare location for Arabic/Roman groups, or a
            ``(location, label)`` tuple for custom groups.
        """
        if isinstance(page_location, int):
            assert self.__page_number_type != PageNumberTypes.Custom
            self.page_locations.append(page_location)
        else:
            assert self.__page_number_type == PageNumberTypes.Custom
            self.page_locations.append(page_location[0])
            self.__page_number_labels.append(page_location[1])

    @property
    def page_number_types(self) -> PageNumberTypes:
        """The numbering type of this group."""
        return self.__page_number_type

    @property
    def number_of_pages(self) -> int:
        """How many pages the group holds."""
        return len(self.page_locations)

    @property
    def last_value(self) -> int:
        """The numeric value of the last page: first value plus page count minus 1."""
        return self.__first_value + len(self.page_locations) - 1

    def get_page_map(self, starting_location: int) -> str:
        """Return this group's ``(start,type,values)`` page map entry.

        :param starting_location: the value written as the first field.
        :return: the entry; ``values`` is the first value for Arabic/Roman
            groups and the ``|``-joined labels for custom groups.
        """
        if self.__page_number_type != PageNumberTypes.Custom:
            values = str(self.__first_value)
        else:
            values = "|".join(self.__page_number_labels)
        return "(%s,%s,%s)" % (starting_location, self.__page_number_type.value, values)
|
@ -0,0 +1,11 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2022, Vaso Peras-Likodric <vaso at vipl.in.rs>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import enum
|
||||
|
||||
|
||||
class PageNumberTypes(str, enum.Enum):
    """Page numbering types, each valued with a single-letter code."""

    Arabic = 'a'
    Roman = 'r'
    Custom = 'c'
|
43
src/calibre/devices/kindle/apnx_page_generator/pages.py
Normal file
43
src/calibre/devices/kindle/apnx_page_generator/pages.py
Normal file
@ -0,0 +1,43 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2022, Vaso Peras-Likodric <vaso at vipl.in.rs>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import itertools
|
||||
|
||||
from calibre.devices.kindle.apnx_page_generator.page_group import PageGroup
|
||||
from calibre.devices.kindle.apnx_page_generator.page_number_type import PageNumberTypes
|
||||
|
||||
|
||||
class Pages:
    """An ordered collection of page groups."""

    def __init__(self, page_locations: list[int] | None = None):
        """Create the collection.

        :param page_locations: when a list is given, it becomes a single
            Arabic group numbered from 1; otherwise the collection starts
            empty.
        """
        if isinstance(page_locations, list):
            self.__pages_groups: list[PageGroup] = [PageGroup(page_locations, PageNumberTypes.Arabic, 1)]
        else:
            self.__pages_groups: list[PageGroup] = []

    def append(self, page_location: PageGroup) -> None:
        """Add a group after the existing ones."""
        self.__pages_groups.append(page_location)

    @property
    def last_group(self) -> PageGroup:
        """The most recently added group; raises IndexError when there is none."""
        return self.__pages_groups[-1]

    @property
    def page_maps(self) -> str:
        """The comma-joined page map entries of all groups.

        Each group's entry is given the 1-based index of its first page
        counted across all groups.
        """
        location = 1
        result = []
        for group in self.__pages_groups:
            result.append(group.get_page_map(location))
            location += group.number_of_pages
        return ",".join(result)

    @property
    def page_locations(self) -> list[int]:
        """The locations of all pages, in group order, as one flat list."""
        return list(itertools.chain.from_iterable(group.page_locations for group in self.__pages_groups))

    @property
    def number_of_pages(self) -> int:
        """The total number of pages over all groups."""
        return sum(len(group.page_locations) for group in self.__pages_groups)
|
||||
|
||||
|
@ -2,6 +2,8 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john at nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.devices.kindle.apnx import APNXBuilder
|
||||
|
||||
'''
|
||||
Device driver for Amazon's Kindle
|
||||
'''
|
||||
@ -409,7 +411,7 @@ class KINDLE2(KINDLE):
|
||||
OPT_APNX_CUST_COL = 2
|
||||
OPT_APNX_METHOD_COL = 3
|
||||
OPT_APNX_OVERWRITE = 4
|
||||
EXTRA_CUSTOMIZATION_CHOICES = {OPT_APNX_METHOD:{'fast', 'accurate', 'pagebreak'}}
|
||||
EXTRA_CUSTOMIZATION_CHOICES = {OPT_APNX_METHOD: APNXBuilder.generators.keys()}
|
||||
|
||||
# x330 on the PaperWhite
|
||||
# x262 on the Touch. Doesn't choke on x330, though.
|
||||
|
Loading…
x
Reference in New Issue
Block a user