commit ab19edb96f
Merge from trunk
@@ -3,10 +3,11 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     title = u'FHM UK'
     description = 'Good News for Men'
-    cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
+    cover_url = 'http://www.greatmagazines.co.uk/covers/large/w197/current/fhm.jpg'
+    # cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
     masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
     __author__ = 'Dave Asbury'
-    # last updated 27/1/12
+    # last updated 17/3/12
     language = 'en_GB'
     oldest_article = 28
     max_articles_per_feed = 12
@@ -29,6 +30,8 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     feeds = [
         (u'From the Homepage',u'http://feed43.com/8053226782885416.xml'),
         (u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
-        (u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
-        (u'Gaming',u'http://feed43.com/0755006465351035.xml'),
+        (u'Upgrade',u'http://feed43.com/0877305847443234.xml'),
+        #(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
+        #(u'Gaming',u'http://feed43.com/0755006465351035.xml'),
+        (u'Gaming',u'http://feed43.com/6537162612465672.xml'),
     ]
recipes/ivanamilakovic.recipe (Normal file, 43 lines)
@@ -0,0 +1,43 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
ivanamilakovic.blogspot.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class IvanaMilakovic(BasicNewsRecipe):
    title                 = u'Ivana Milaković'
    __author__            = 'Darko Miletic'
    description           = u'Hronika mačijeg škrabala - priče, inspiracija, knjige, pisanje, prevodi...'
    oldest_article        = 80
    max_articles_per_feed = 100
    language              = 'sr'
    encoding              = 'utf-8'
    no_stylesheets        = True
    use_embedded_content  = True
    publication_type      = 'blog'
    extra_css             = """
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Arial,Tahoma,Helvetica,FreeSans,sans1,sans-serif}
        img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px }
    """

    conversion_options = {
                           'comment'   : description
                         , 'tags'      : 'knjige, blog, srbija, sf'
                         , 'publisher' : 'Ivana Milakovic'
                         , 'language'  : language
                         }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    feeds = [(u'Posts', u'http://ivanamilakovic.blogspot.com/feeds/posts/default')]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)
42
recipes/klubknjige.recipe
Normal file
42
recipes/klubknjige.recipe
Normal file
@ -0,0 +1,42 @@
|
||||

__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
klub-knjige.blogspot.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class KlubKnjige(BasicNewsRecipe):
    title                 = 'Klub knjige'
    __author__            = 'Darko Miletic'
    description           = 'literarni blog'
    oldest_article        = 30
    max_articles_per_feed = 100
    language              = 'sr'
    encoding              = 'utf-8'
    no_stylesheets        = True
    use_embedded_content  = True
    publication_type      = 'blog'
    extra_css             = """
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Arial,Tahoma,Helvetica,FreeSans,sans1,sans-serif}
        img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px }
    """

    conversion_options = {
                           'comment'   : description
                         , 'tags'      : 'knjige, blog, srbija, sf'
                         , 'publisher' : 'Klub Knjige'
                         , 'language'  : language
                         }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    feeds = [(u'Posts', u'http://klub-knjige.blogspot.com/feeds/posts/default')]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)
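
Both recipes above register the same preprocess_regexps entry. A small self-contained sketch of what it does: every capital Đ (U+0110) is rewritten to the visually similar Ð (U+00D0), presumably to work around glyph coverage in the embedded Sony reader font referenced from extra_css (that rationale is my inference, not stated in the recipes):

# Sketch of the shared preprocess_regexps substitution; lowercase đ (U+0111)
# is deliberately untouched by the pattern.
import re

preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

text = u'\u0110or\u0111e'   # 'Đorđe'
for pattern, repl in preprocess_regexps:
    text = pattern.sub(repl, text)
assert text == u'\u00D0or\u0111e'   # only the capital form is mapped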
@@ -3,7 +3,6 @@ __copyright__ = '2011'
 '''
 lemonde.fr
 '''
-import re
 from calibre.web.feeds.recipes import BasicNewsRecipe

 class LeMonde(BasicNewsRecipe):
@@ -41,77 +40,8 @@ class LeMonde(BasicNewsRecipe):

     remove_empty_feeds = True

     filterDuplicates = True
+    auto_cleanup = True

-    def preprocess_html(self, soup):
-        for alink in soup.findAll('a'):
-            if alink.string is not None:
-                tstr = alink.string
-                alink.replaceWith(tstr)
-        return self.adeify_images(soup)
-
-    preprocess_regexps = [
-        (re.compile(r'([0-9])%'), lambda m: m.group(1) + ' %'),
-        (re.compile(r'([0-9])([0-9])([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + m.group(2) + m.group(3) + ' ' + m.group(4) + m.group(5) + m.group(6)),
-        (re.compile(r'([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + ' ' + m.group(2) + m.group(3) + m.group(4)),
-        (re.compile(r'<span>'), lambda match: ' <span>'),
-        (re.compile(r'\("'), lambda match: '(« '),
-        (re.compile(r'"\)'), lambda match: ' »)'),
-        (re.compile(r'“'), lambda match: '(« '),
-        (re.compile(r'”'), lambda match: ' »)'),
-        (re.compile(r'>\''), lambda match: '>‘'),
-        (re.compile(r' \''), lambda match: ' ‘'),
-        (re.compile(r'\''), lambda match: '’'),
-        (re.compile(r'"<em>'), lambda match: '<em>« '),
-        (re.compile(r'"<em>"</em><em>'), lambda match: '<em>« '),
-        (re.compile(r'"<a href='), lambda match: '« <a href='),
-        (re.compile(r'</em>"'), lambda match: ' »</em>'),
-        (re.compile(r'</a>"'), lambda match: ' »</a>'),
-        (re.compile(r'"</'), lambda match: ' »</'),
-        (re.compile(r'>"'), lambda match: '>« '),
-        (re.compile(r'"<'), lambda match: ' »<'),
-        (re.compile(r'’"'), lambda match: '’« '),
-        (re.compile(r' "'), lambda match: ' « '),
-        (re.compile(r'" '), lambda match: ' » '),
-        (re.compile(r'"\.'), lambda match: ' ».'),
-        (re.compile(r'",'), lambda match: ' »,'),
-        (re.compile(r'"\?'), lambda match: ' »?'),
-        (re.compile(r'":'), lambda match: ' »:'),
-        (re.compile(r'";'), lambda match: ' »;'),
-        (re.compile(r'"\!'), lambda match: ' »!'),
-        (re.compile(r' :'), lambda match: ' :'),
-        (re.compile(r' ;'), lambda match: ' ;'),
-        (re.compile(r' \?'), lambda match: ' ?'),
-        (re.compile(r' \!'), lambda match: ' !'),
-        (re.compile(r'\s»'), lambda match: ' »'),
-        (re.compile(r'«\s'), lambda match: '« '),
-        (re.compile(r' %'), lambda match: ' %'),
-        (re.compile(r'\.jpg » border='), lambda match: '.jpg'),
-        (re.compile(r'\.png » border='), lambda match: '.png'),
-        (re.compile(r' – '), lambda match: ' – '),
-        (re.compile(r' – '), lambda match: ' – '),
-        (re.compile(r' - '), lambda match: ' – '),
-        (re.compile(r' -,'), lambda match: ' –,'),
-        (re.compile(r'»:'), lambda match: '» :'),
-    ]
-
-
-    keep_only_tags = [
-        dict(name='div', attrs={'class':['contenu']})
-    ]
-    remove_tags = [dict(name='div', attrs={'class':['LM_atome']})]
-    remove_tags_after = [dict(id='appel_temoignage')]
-
-    def get_article_url(self, article):
-        url = article.get('guid', None)
-        if '/chat/' in url or '.blog' in url or '/video/' in url or '/sport/' in url or '/portfolio/' in url or '/visuel/' in url :
-            url = None
-        return url
-
-    # def get_article_url(self, article):
-    #     link = article.get('link')
-    #     if 'blog' not in link and ('chat' not in link):
-    #         return link

     feeds = [
         ('A la une', 'http://www.lemonde.fr/rss/une.xml'),
@@ -137,3 +67,10 @@ class LeMonde(BasicNewsRecipe):

         return cover_url

+    def get_article_url(self, article):
+        url = article.get('guid', None)
+        if '/chat/' in url or '.blog' in url or '/video/' in url or '/sport/' in url or '/portfolio/' in url or '/visuel/' in url :
+            url = None
+        return url
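
A standalone sketch of the guid filter that was just re-added. The tuple and any() form, plus the None guard, are my restatement; the recipe itself chains `or` tests and assumes every feed item carries a guid:

# Returning None makes calibre skip the article, so unwanted Le Monde
# sections (chats, blogs, videos, sport, portfolios, visuals) are dropped.
SKIP_FRAGMENTS = ('/chat/', '.blog', '/video/', '/sport/', '/portfolio/', '/visuel/')

def filter_article_url(guid):
    if guid is None or any(frag in guid for frag in SKIP_FRAGMENTS):
        return None
    return guid

assert filter_article_url('http://www.lemonde.fr/sport/a.html') is None
assert filter_article_url('http://www.lemonde.fr/politique/b.html') is not None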
@@ -14,7 +14,7 @@ from setup.build_environment import msvc, MT, RC
 from setup.installer.windows.wix import WixMixIn

 OPENSSL_DIR = r'Q:\openssl'
-QT_DIR = 'Q:\\Qt\\4.7.3'
+QT_DIR = 'Q:\\Qt\\4.8.0'
 QT_DLLS = ['Core', 'Gui', 'Network', 'Svg', 'WebKit', 'Xml', 'XmlPatterns']
 LIBUNRAR = 'C:\\Program Files\\UnrarDLL\\unrar.dll'
 SW = r'C:\cygwin\home\kovid\sw'
@@ -97,7 +97,9 @@ Now, run configure and make::

 -no-plugin-manifests is needed so that loading the plugins does not fail looking for the CRT assembly

-    configure -opensource -release -qt-zlib -qt-gif -qt-libmng -qt-libpng -qt-libtiff -qt-libjpeg -release -platform win32-msvc2008 -no-qt3support -webkit -xmlpatterns -no-phonon -no-style-plastique -no-style-cleanlooks -no-style-motif -no-style-cde -no-declarative -no-scripttools -no-audio-backend -no-multimedia -no-dbus -no-openvg -no-opengl -no-qt3support -confirm-license -nomake examples -nomake demos -nomake docs -no-plugin-manifests -openssl -I Q:\openssl\include -L Q:\openssl\lib && nmake
+    configure -opensource -release -qt-zlib -qt-libmng -qt-libpng -qt-libtiff -qt-libjpeg -release -platform win32-msvc2008 -no-qt3support -webkit -xmlpatterns -no-phonon -no-style-plastique -no-style-cleanlooks -no-style-motif -no-style-cde -no-declarative -no-scripttools -no-audio-backend -no-multimedia -no-dbus -no-openvg -no-opengl -no-qt3support -confirm-license -nomake examples -nomake demos -nomake docs -no-plugin-manifests -openssl -I Q:\openssl\include -L Q:\openssl\lib && nmake
+
+Add the path to the bin folder inside the Qt dir to your system PATH.

 SIP
 -----
@@ -381,12 +381,15 @@ def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
         user_agent = USER_AGENT_MOBILE if mobile_browser else USER_AGENT
     opener.addheaders = [('User-agent', user_agent)]
     proxies = get_proxies()
+    to_add = {}
     http_proxy = proxies.get('http', None)
     if http_proxy:
-        opener.set_proxies({'http':http_proxy})
+        to_add['http'] = http_proxy
     https_proxy = proxies.get('https', None)
     if https_proxy:
-        opener.set_proxies({'https':https_proxy})
+        to_add['https'] = https_proxy
+    if to_add:
+        opener.set_proxies(to_add)

     return opener
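
A hedged sketch of the collect-then-set pattern this hunk introduces, shown on a plain mechanize.Browser (calibre builds a customised opener around mechanize). The diff suggests set_proxies() replaces, rather than merges, any proxies configured by an earlier call, so both schemes must be passed together:

import mechanize

def proxied_browser(http_proxy=None, https_proxy=None):
    br = mechanize.Browser()
    to_add = {}
    if http_proxy:
        to_add['http'] = http_proxy
    if https_proxy:
        to_add['https'] = https_proxy
    if to_add:
        br.set_proxies(to_add)  # a single call covering every scheme
    return br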
@@ -625,7 +625,8 @@ from calibre.devices.eb600.driver import (EB600, COOL_ER, SHINEBOOK,
         POCKETBOOK701, POCKETBOOK360P, PI2)
 from calibre.devices.iliad.driver import ILIAD
 from calibre.devices.irexdr.driver import IREXDR1000, IREXDR800
-from calibre.devices.jetbook.driver import JETBOOK, MIBUK, JETBOOK_MINI
+from calibre.devices.jetbook.driver import (JETBOOK, MIBUK, JETBOOK_MINI,
+        JETBOOK_COLOR)
 from calibre.devices.kindle.driver import (KINDLE, KINDLE2, KINDLE_DX,
         KINDLE_FIRE)
 from calibre.devices.nook.driver import NOOK, NOOK_COLOR
@@ -664,9 +665,7 @@ plugins += [
     ILIAD,
     IREXDR1000,
     IREXDR800,
-    JETBOOK,
-    JETBOOK_MINI,
-    MIBUK,
+    JETBOOK, JETBOOK_MINI, MIBUK, JETBOOK_COLOR,
     SHINEBOOK,
     POCKETBOOK360, POCKETBOOK301, POCKETBOOK602, POCKETBOOK701, POCKETBOOK360P,
     PI2,
@@ -234,7 +234,7 @@ def main(args=sys.argv):
             sql_dump = args[-1]
         reinit_db(opts.reinitialize_db, sql_dump=sql_dump)
     elif opts.inspect_mobi:
-        from calibre.ebooks.mobi.debug import inspect_mobi
+        from calibre.ebooks.mobi.debug.main import inspect_mobi
        for path in args[1:]:
             prints('Inspecting:', path)
             inspect_mobi(path)
@@ -125,4 +125,29 @@ class JETBOOK_MINI(USBMS):

     SUPPORTS_SUB_DIRS = True

+class JETBOOK_COLOR(USBMS):
+
+    '''
+    set([(u'0x951',
+        u'0x160b',
+        u'0x0',
+        u'Freescale',
+        u'Mass Storage Device',
+        u'0802270905553')])
+    '''
+
+    FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'djvu']
+
+    gui_name = 'JetBook Color'
+    name = 'JetBook Color Device Interface'
+    description = _('Communicate with the JetBook Color reader.')
+    author = 'Kovid Goyal'
+
+    VENDOR_ID = [0x951]
+    PRODUCT_ID = [0x160b]
+    BCD = [0x0]
+    EBOOK_DIR_MAIN = 'My Books'
+
+    SUPPORTS_SUB_DIRS = True
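
Illustration only: how the hex strings in the JETBOOK_COLOR docstring line up with the matching constants the new driver declares. The tuple order (vendor id, product id, bcd, manufacturer, product name, serial) is inferred from the values themselves, not from a documented calibre API:

detected = (u'0x951', u'0x160b', u'0x0', u'Freescale',
            u'Mass Storage Device', u'0802270905553')
vendor, product, bcd = (int(x, 16) for x in detected[:3])
assert vendor in [0x951]    # VENDOR_ID
assert product in [0x160b]  # PRODUCT_ID
assert bcd in [0x0]         # BCD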
@@ -27,7 +27,7 @@ class PRS505(USBMS):
     booklist_class = CollectionsBookList


-    FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt']
+    FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt', 'zbf']
     CAN_SET_METADATA = ['title', 'authors', 'collections']
     CAN_DO_DEVICE_DB_PLUGBOARD = True
@@ -190,12 +190,22 @@ class EPUBOutput(OutputFormatPlugin):
             if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode(x).startswith('urn:uuid:'):
                 uuid = unicode(x).split(':')[-1]
                 break
+        encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

         if uuid is None:
             self.log.warn('No UUID identifier found')
             from uuid import uuid4
             uuid = str(uuid4())
             oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)

+        if encrypted_fonts and not uuid.startswith('urn:uuid:'):
+            # Apparently ADE requires this value to start with urn:uuid:
+            # for some absurd reason, or it will throw a hissy fit and refuse
+            # to use the obfuscated fonts.
+            for x in identifiers:
+                if unicode(x) == uuid:
+                    x.content = 'urn:uuid:'+uuid
+
         with TemporaryDirectory(u'_epub_output') as tdir:
             from calibre.customize.ui import plugin_for_output_format
             metadata_xml = None
@@ -210,7 +220,6 @@ class EPUBOutput(OutputFormatPlugin):
             opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
             self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)\
                     if x.endswith('.ncx')][0])
-            encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
             encryption = None
             if encrypted_fonts:
                 encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)
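
For context, a sketch of Adobe-style font obfuscation, which is my reconstruction and not calibre's encrypt_fonts: the 16 raw bytes of the package UUID are XORed over the first 1024 bytes of the font. ADE derives the same key from the package identifier, hence the insistence on the urn:uuid: form in the hunk above:

import binascii

def obfuscate_font(font_data, uuid):
    # strip the urn:uuid: prefix and hyphens, then hex-decode to 16 key bytes
    hexed = uuid.replace('urn:uuid:', '').replace('-', '')
    key = bytearray(binascii.unhexlify(hexed))
    head = bytearray(font_data[:1024])
    for i in range(len(head)):
        head[i] ^= key[i % len(key)]  # XOR is its own inverse
    return bytes(head) + font_data[1024:]

raw = b'\x00\x01\x00\x00' + b'\x00' * 2000  # fake sfnt-ish payload
uid = 'urn:uuid:12345678-1234-1234-1234-1234567890ab'
assert obfuscate_font(obfuscate_font(raw, uid), uid) == raw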
@@ -59,7 +59,10 @@ class MOBIInput(InputFormatPlugin):
             if mr.kf8_type is not None:
                 log('Found KF8 MOBI of type %r'%mr.kf8_type)
                 from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
-                return os.path.abspath(Mobi8Reader(mr, log)())
+                mr = Mobi8Reader(mr, log)
+                opf = os.path.abspath(mr())
+                self.encrypted_fonts = mr.encrypted_fonts
+                return opf

         raw = parse_cache.pop('calibre_raw_mobi_markup', False)
         if raw:
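
This change is the other half of the EPUBOutput hunk above: the input plugin now records the fonts the Mobi8Reader found so the output side can re-obfuscate them. A minimal sketch of the hand-off; the two Fake* classes are illustrative stand-ins, and only the getattr() idiom and the encrypted_fonts attribute name come from the diff itself:

class FakeMobi8Reader(object):
    encrypted_fonts = ['fonts/00001.ttf']

class FakeInputPlugin(object):
    def convert(self, reader):
        # stash what the reader discovered on the plugin instance
        self.encrypted_fonts = list(reader.encrypted_fonts)

plugin = FakeInputPlugin()
plugin.convert(FakeMobi8Reader())
# the output plugin later reads it defensively, as EPUBOutput does above:
assert getattr(plugin, 'encrypted_fonts', []) == ['fonts/00001.ttf']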
@@ -179,7 +179,7 @@ class MOBIOutput(OutputFormatPlugin):
             writer(oeb, output_path)

         if opts.extract_to is not None:
-            from calibre.ebooks.mobi.debug import inspect_mobi
+            from calibre.ebooks.mobi.debug.main import inspect_mobi
             ddir = opts.extract_to
             inspect_mobi(output_path, ddir=ddir)
(File diff suppressed because it is too large)
src/calibre/ebooks/mobi/debug/__init__.py (Normal file, 16 lines)
@@ -0,0 +1,16 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'


def format_bytes(byts):
    byts = bytearray(byts)
    byts = [hex(b)[2:] for b in byts]
    return ' '.join(byts)
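
A quick illustration of format_bytes as defined above; note that values are not zero-padded, because hex(b)[2:] strips only the '0x' prefix:

def format_bytes(byts):
    byts = bytearray(byts)
    byts = [hex(b)[2:] for b in byts]
    return ' '.join(byts)

assert format_bytes(b'MOBI') == '4d 4f 42 49'
assert format_bytes(b'\x0b') == 'b'  # single hex digit, no padding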
src/calibre/ebooks/mobi/debug/headers.py (Normal file, 535 lines)
@@ -0,0 +1,535 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import struct, datetime, os

from calibre.utils.date import utc_tz
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.langcodes import main_language, sub_language
from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.utils import get_trailing_data

# PalmDB {{{
class PalmDOCAttributes(object):

    class Attr(object):

        def __init__(self, name, field, val):
            self.name = name
            self.val = val & field

        def __str__(self):
            return '%s: %s'%(self.name, bool(self.val))

    def __init__(self, raw):
        self.val = struct.unpack(b'<H', raw)[0]
        self.attributes = []
        for name, field in [('Read Only', 0x02), ('Dirty AppInfoArea', 0x04),
                ('Backup this database', 0x08),
                ('Okay to install newer over existing copy, if present on PalmPilot', 0x10),
                ('Force the PalmPilot to reset after this database is installed', 0x12),
                ('Don\'t allow copy of file to be beamed to other Pilot',
                    0x14)]:
            self.attributes.append(PalmDOCAttributes.Attr(name, field,
                self.val))

    def __str__(self):
        attrs = '\n\t'.join([str(x) for x in self.attributes])
        return 'PalmDOC Attributes: %s\n\t%s'%(bin(self.val), attrs)

class PalmDB(object):

    def __init__(self, raw):
        self.raw = raw

        if self.raw.startswith(b'TPZ'):
            raise ValueError('This is a Topaz file')

        self.name = self.raw[:32].replace(b'\x00', b'')
        self.attributes = PalmDOCAttributes(self.raw[32:34])
        self.version = struct.unpack(b'>H', self.raw[34:36])[0]

        palm_epoch = datetime.datetime(1904, 1, 1, tzinfo=utc_tz)
        self.creation_date_raw = struct.unpack(b'>I', self.raw[36:40])[0]
        self.creation_date = (palm_epoch +
                datetime.timedelta(seconds=self.creation_date_raw))
        self.modification_date_raw = struct.unpack(b'>I', self.raw[40:44])[0]
        self.modification_date = (palm_epoch +
                datetime.timedelta(seconds=self.modification_date_raw))
        self.last_backup_date_raw = struct.unpack(b'>I', self.raw[44:48])[0]
        self.last_backup_date = (palm_epoch +
                datetime.timedelta(seconds=self.last_backup_date_raw))
        self.modification_number = struct.unpack(b'>I', self.raw[48:52])[0]
        self.app_info_id = self.raw[52:56]
        self.sort_info_id = self.raw[56:60]
        self.type = self.raw[60:64]
        self.creator = self.raw[64:68]
        self.ident = self.type + self.creator
        if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
            raise ValueError('Unknown book ident: %r'%self.ident)
        self.last_record_uid, = struct.unpack(b'>I', self.raw[68:72])
        self.next_rec_list_id = self.raw[72:76]

        self.number_of_records, = struct.unpack(b'>H', self.raw[76:78])

    def __str__(self):
        ans = ['*'*20 + ' PalmDB Header '+ '*'*20]
        ans.append('Name: %r'%self.name)
        ans.append(str(self.attributes))
        ans.append('Version: %s'%self.version)
        ans.append('Creation date: %s (%s)'%(self.creation_date.isoformat(),
            self.creation_date_raw))
        ans.append('Modification date: %s (%s)'%(self.modification_date.isoformat(),
            self.modification_date_raw))
        ans.append('Backup date: %s (%s)'%(self.last_backup_date.isoformat(),
            self.last_backup_date_raw))
        ans.append('Modification number: %s'%self.modification_number)
        ans.append('App Info ID: %r'%self.app_info_id)
        ans.append('Sort Info ID: %r'%self.sort_info_id)
        ans.append('Type: %r'%self.type)
        ans.append('Creator: %r'%self.creator)
        ans.append('Last record UID +1: %r'%self.last_record_uid)
        ans.append('Next record list id: %r'%self.next_rec_list_id)
        ans.append('Number of records: %s'%self.number_of_records)

        return '\n'.join(ans)
# }}}

class Record(object): # {{{

    def __init__(self, raw, header):
        self.offset, self.flags, self.uid = header
        self.raw = raw

    @property
    def header(self):
        return 'Offset: %d Flags: %d UID: %d First 4 bytes: %r Size: %d'%(self.offset, self.flags,
                self.uid, self.raw[:4], len(self.raw))
# }}}

# EXTH {{{
class EXTHRecord(object):

    def __init__(self, type_, data):
        self.type = type_
        self.data = data
        self.name = {
                1 : 'DRM Server id',
                2 : 'DRM Commerce id',
                3 : 'DRM ebookbase book id',
                100 : 'author',
                101 : 'publisher',
                102 : 'imprint',
                103 : 'description',
                104 : 'isbn',
                105 : 'subject',
                106 : 'publishingdate',
                107 : 'review',
                108 : 'contributor',
                109 : 'rights',
                110 : 'subjectcode',
                111 : 'type',
                112 : 'source',
                113 : 'asin',
                114 : 'versionnumber',
                115 : 'sample',
                116 : 'startreading',
                117 : 'adult',
                118 : 'retailprice',
                119 : 'retailpricecurrency',
                121 : 'KF8 header section index',
                125 : 'KF8 resources (images/fonts) count',
                129 : 'KF8 cover URI',
                131 : 'KF8 unknown count',
                201 : 'coveroffset',
                202 : 'thumboffset',
                203 : 'hasfakecover',
                204 : 'Creator Software',
                205 : 'Creator Major Version', # '>I'
                206 : 'Creator Minor Version', # '>I'
                207 : 'Creator Build Number', # '>I'
                208 : 'watermark',
                209 : 'tamper_proof_keys',
                300 : 'fontsignature',
                301 : 'clippinglimit', # percentage '>B'
                402 : 'publisherlimit',
                404 : 'TTS flag', # '>B' 1 - TTS disabled 0 - TTS enabled
                501 : 'cdetype', # 4 chars (PDOC or EBOK)
                502 : 'lastupdatetime',
                503 : 'updatedtitle',
        }.get(self.type, repr(self.type))

        if (self.name in {'coveroffset', 'thumboffset', 'hasfakecover',
                'Creator Major Version', 'Creator Minor Version',
                'Creator Build Number', 'Creator Software', 'startreading'} or
                self.type in {121, 125, 131}):
            self.data, = struct.unpack(b'>I', self.data)

    def __str__(self):
        return '%s (%d): %r'%(self.name, self.type, self.data)

class EXTHHeader(object):

    def __init__(self, raw):
        self.raw = raw
        if not self.raw.startswith(b'EXTH'):
            raise ValueError('EXTH header does not start with EXTH')
        self.length, = struct.unpack(b'>I', self.raw[4:8])
        self.count, = struct.unpack(b'>I', self.raw[8:12])

        pos = 12
        self.records = []
        for i in xrange(self.count):
            pos = self.read_record(pos)
        self.records.sort(key=lambda x:x.type)
        self.rmap = {x.type:x for x in self.records}

    def __getitem__(self, type_):
        return self.rmap.__getitem__(type_).data

    def get(self, type_, default=None):
        ans = self.rmap.get(type_, default)
        return getattr(ans, 'data', default)

    def read_record(self, pos):
        type_, length = struct.unpack(b'>II', self.raw[pos:pos+8])
        data = self.raw[(pos+8):(pos+length)]
        self.records.append(EXTHRecord(type_, data))
        return pos + length

    @property
    def kf8_header_index(self):
        return self.get(121, None)

    def __str__(self):
        ans = ['*'*20 + ' EXTH Header '+ '*'*20]
        ans.append('EXTH header length: %d'%self.length)
        ans.append('Number of EXTH records: %d'%self.count)
        ans.append('EXTH records...')
        for r in self.records:
            ans.append(str(r))
        return '\n'.join(ans)
# }}}

class MOBIHeader(object): # {{{

    def __init__(self, record0, offset):
        self.raw = record0.raw
        self.header_offset = offset

        self.compression_raw = self.raw[:2]
        self.compression = {1: 'No compression', 2: 'PalmDoc compression',
                17480: 'HUFF/CDIC compression'}.get(struct.unpack(b'>H',
                    self.compression_raw)[0],
                    repr(self.compression_raw))
        self.unused = self.raw[2:4]
        self.text_length, = struct.unpack(b'>I', self.raw[4:8])
        self.number_of_text_records, self.text_record_size = \
                struct.unpack(b'>HH', self.raw[8:12])
        self.encryption_type_raw, = struct.unpack(b'>H', self.raw[12:14])
        self.encryption_type = {
                0: 'No encryption',
                1: 'Old mobipocket encryption',
                2: 'Mobipocket encryption'
            }.get(self.encryption_type_raw, repr(self.encryption_type_raw))
        self.unknown = self.raw[14:16]

        self.identifier = self.raw[16:20]
        if self.identifier != b'MOBI':
            raise ValueError('Identifier %r unknown'%self.identifier)

        self.length, = struct.unpack(b'>I', self.raw[20:24])
        self.type_raw, = struct.unpack(b'>I', self.raw[24:28])
        self.type = {
                2 : 'Mobipocket book',
                3 : 'PalmDOC book',
                4 : 'Audio',
                257 : 'News',
                258 : 'News Feed',
                259 : 'News magazine',
                513 : 'PICS',
                514 : 'Word',
                515 : 'XLS',
                516 : 'PPT',
                517 : 'TEXT',
                518 : 'HTML',
            }.get(self.type_raw, repr(self.type_raw))

        self.encoding_raw, = struct.unpack(b'>I', self.raw[28:32])
        self.encoding = {
                1252 : 'cp1252',
                65001: 'utf-8',
            }.get(self.encoding_raw, repr(self.encoding_raw))
        self.uid = self.raw[32:36]
        self.file_version, = struct.unpack(b'>I', self.raw[36:40])
        self.meta_orth_indx, self.meta_infl_indx = struct.unpack(
                b'>II', self.raw[40:48])
        self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52])
        self.reserved = self.raw[52:80]
        self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84])
        self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88])
        self.fullname_length, = struct.unpack(b'>I', self.raw[88:92])
        self.locale_raw, = struct.unpack(b'>I', self.raw[92:96])
        langcode = self.locale_raw
        langid = langcode & 0xFF
        sublangid = (langcode >> 10) & 0xFF
        self.language = main_language.get(langid, 'ENGLISH')
        self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')

        self.input_language = self.raw[96:100]
        self.output_langauage = self.raw[100:104]
        self.min_version, = struct.unpack(b'>I', self.raw[104:108])
        self.first_image_index, = struct.unpack(b'>I', self.raw[108:112])
        self.huffman_record_offset, = struct.unpack(b'>I', self.raw[112:116])
        self.huffman_record_count, = struct.unpack(b'>I', self.raw[116:120])
        self.datp_record_offset, = struct.unpack(b'>I', self.raw[120:124])
        self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128])
        self.exth_flags, = struct.unpack(b'>I', self.raw[128:132])
        self.has_exth = bool(self.exth_flags & 0x40)
        self.has_drm_data = self.length >= 174 and len(self.raw) >= 180
        if self.has_drm_data:
            self.unknown3 = self.raw[132:164]
            self.drm_offset, = struct.unpack(b'>I', self.raw[164:168])
            self.drm_count, = struct.unpack(b'>I', self.raw[168:172])
            self.drm_size, = struct.unpack(b'>I', self.raw[172:176])
            self.drm_flags = bin(struct.unpack(b'>I', self.raw[176:180])[0])
        self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16
        self.has_fcis_flis = False
        self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False
        self.extra_data_flags = 0
        if self.has_extra_data_flags:
            self.unknown4 = self.raw[180:192]
            self.fdst_idx, self.fdst_count = struct.unpack_from(b'>II',
                    self.raw, 192)
            (self.fcis_number, self.fcis_count, self.flis_number,
                    self.flis_count) = struct.unpack(b'>IIII',
                            self.raw[200:216])
            self.unknown6 = self.raw[216:224]
            self.srcs_record_index = struct.unpack(b'>I',
                self.raw[224:228])[0]
            self.num_srcs_records = struct.unpack(b'>I',
                self.raw[228:232])[0]
            self.unknown7 = self.raw[232:240]
            self.extra_data_flags = struct.unpack(b'>I',
                self.raw[240:244])[0]
            self.has_multibytes = bool(self.extra_data_flags & 0b1)
            self.has_indexing_bytes = bool(self.extra_data_flags & 0b10)
            self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100)
            self.primary_index_record, = struct.unpack(b'>I',
                    self.raw[244:248])

        if self.file_version >= 8:
            (self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx
                    ) = struct.unpack_from(b'>4L', self.raw, 248)
            self.unknown9 = self.raw[264:self.length]
            if self.meta_orth_indx != self.sect_idx:
                raise ValueError('KF8 header has different Meta orth and '
                        'section indices')

        # The following are all relative to the position of the header record
        # make them absolute for ease of debugging
        for x in ('sect_idx', 'skel_idx', 'datp_idx', 'oth_idx',
                'meta_orth_indx', 'huffman_record_offset',
                'first_non_book_record', 'datp_record_offset', 'fcis_number',
                'flis_number', 'primary_index_record', 'fdst_idx',
                'first_image_index'):
            if hasattr(self, x):
                setattr(self, x, self.header_offset+getattr(self, x))

        if self.has_exth:
            self.exth_offset = 16 + self.length

            self.exth = EXTHHeader(self.raw[self.exth_offset:])

            self.end_of_exth = self.exth_offset + self.exth.length
            self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset]

    def __str__(self):
        ans = ['*'*20 + ' MOBI %d Header '%self.file_version+ '*'*20]
        a = ans.append
        i = lambda d, x : a('%s (null value: %d): %d'%(d, NULL_INDEX, x))
        ans.append('Compression: %s'%self.compression)
        ans.append('Unused: %r'%self.unused)
        ans.append('Number of text records: %d'%self.number_of_text_records)
        ans.append('Text record size: %d'%self.text_record_size)
        ans.append('Encryption: %s'%self.encryption_type)
        ans.append('Unknown: %r'%self.unknown)
        ans.append('Identifier: %r'%self.identifier)
        ans.append('Header length: %d'% self.length)
        ans.append('Type: %s'%self.type)
        ans.append('Encoding: %s'%self.encoding)
        ans.append('UID: %r'%self.uid)
        ans.append('File version: %d'%self.file_version)
        i('Meta Orth Index (Sections index in KF8)', self.meta_orth_indx)
        i('Meta Infl Index', self.meta_infl_indx)
        ans.append('Secondary index record: %d (null val: %d)'%(
            self.secondary_index_record, NULL_INDEX))
        ans.append('Reserved: %r'%self.reserved)
        ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX,
            self.first_non_book_record))
        ans.append('Full name offset: %d'%self.fullname_offset)
        ans.append('Full name length: %d bytes'%self.fullname_length)
        ans.append('Langcode: %r'%self.locale_raw)
        ans.append('Language: %s'%self.language)
        ans.append('Sub language: %s'%self.sublanguage)
        ans.append('Input language: %r'%self.input_language)
        ans.append('Output language: %r'%self.output_langauage)
        ans.append('Min version: %d'%self.min_version)
        ans.append('First Image index: %d'%self.first_image_index)
        ans.append('Huffman record offset: %d'%self.huffman_record_offset)
        ans.append('Huffman record count: %d'%self.huffman_record_count)
        ans.append('DATP record offset: %r'%self.datp_record_offset)
        ans.append('DATP record count: %r'%self.datp_record_count)
        ans.append('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth))
        if self.has_drm_data:
            ans.append('Unknown3: %r'%self.unknown3)
            ans.append('DRM Offset: %s'%self.drm_offset)
            ans.append('DRM Count: %s'%self.drm_count)
            ans.append('DRM Size: %s'%self.drm_size)
            ans.append('DRM Flags: %r'%self.drm_flags)
        if self.has_extra_data_flags:
            ans.append('Unknown4: %r'%self.unknown4)
            ans.append('FDST Index: %d'% self.fdst_idx)
            ans.append('FDST Count: %d'% self.fdst_count)
            ans.append('FCIS number: %d'% self.fcis_number)
            ans.append('FCIS count: %d'% self.fcis_count)
            ans.append('FLIS number: %d'% self.flis_number)
            ans.append('FLIS count: %d'% self.flis_count)
            ans.append('Unknown6: %r'% self.unknown6)
            ans.append('SRCS record index: %d'%self.srcs_record_index)
            ans.append('Number of SRCS records?: %d'%self.num_srcs_records)
            ans.append('Unknown7: %r'%self.unknown7)
            ans.append(('Extra data flags: %s (has multibyte: %s) '
                '(has indexing: %s) (has uncrossable breaks: %s)')%(
                    bin(self.extra_data_flags), self.has_multibytes,
                    self.has_indexing_bytes, self.has_uncrossable_breaks ))
            ans.append('Primary index record (null value: %d): %d'%(NULL_INDEX,
                self.primary_index_record))
        if self.file_version >= 8:
            i('Sections Index', self.sect_idx)
            i('SKEL Index', self.skel_idx)
            i('DATP Index', self.datp_idx)
            i('Other Index', self.oth_idx)
            if self.unknown9:
                a('Unknown9: %r'%self.unknown9)

        ans = '\n'.join(ans)

        if self.has_exth:
            ans += '\n\n' + str(self.exth)
        ans += '\n\nBytes after EXTH (%d bytes): %s'%(
                len(self.bytes_after_exth),
                format_bytes(self.bytes_after_exth))

        ans += '\nNumber of bytes after full name: %d' % (len(self.raw) - (self.fullname_offset +
            self.fullname_length))

        ans += '\nRecord 0 length: %d'%len(self.raw)
        return ans
# }}}

class MOBIFile(object):

    def __init__(self, stream):
        self.raw = stream.read()
        self.palmdb = PalmDB(self.raw[:78])

        self.record_headers = []
        self.records = []
        for i in xrange(self.palmdb.number_of_records):
            pos = 78 + i * 8
            offset, a1, a2, a3, a4 = struct.unpack(b'>LBBBB', self.raw[pos:pos+8])
            flags, val = a1, a2 << 16 | a3 << 8 | a4
            self.record_headers.append((offset, flags, val))

        def section(section_number):
            if section_number == self.palmdb.number_of_records - 1:
                end_off = len(self.raw)
            else:
                end_off = self.record_headers[section_number + 1][0]
            off = self.record_headers[section_number][0]
            return self.raw[off:end_off]

        for i in range(self.palmdb.number_of_records):
            self.records.append(Record(section(i), self.record_headers[i]))

        self.mobi_header = MOBIHeader(self.records[0], 0)
        self.huffman_record_nums = []

        self.kf8_type = None
        mh = mh8 = self.mobi_header
        if mh.file_version >= 8:
            self.kf8_type = 'standalone'
        elif mh.has_exth and mh.exth.kf8_header_index is not None:
            self.kf8_type = 'joint'
            kf8i = mh.exth.kf8_header_index
            mh8 = MOBIHeader(self.records[kf8i], kf8i)
        self.mobi8_header = mh8

        if 'huff' in self.mobi_header.compression.lower():
            from calibre.ebooks.mobi.huffcdic import HuffReader

            def huffit(off, cnt):
                huffman_record_nums = list(xrange(off, off+cnt))
                huffrecs = [self.records[r].raw for r in huffman_record_nums]
                huffs = HuffReader(huffrecs)
                return huffman_record_nums, huffs.unpack

            if self.kf8_type == 'joint':
                recs6, d6 = huffit(mh.huffman_record_offset,
                        mh.huffman_record_count)
                recs8, d8 = huffit(mh8.huffman_record_offset,
                        mh8.huffman_record_count)
                self.huffman_record_nums = recs6 + recs8
            else:
                self.huffman_record_nums, d6 = huffit(mh.huffman_record_offset,
                        mh.huffman_record_count)
                d8 = d6
        elif 'palmdoc' in self.mobi_header.compression.lower():
            from calibre.ebooks.compression.palmdoc import decompress_doc
            d8 = d6 = decompress_doc
        else:
            d8 = d6 = lambda x: x

        self.decompress6, self.decompress8 = d6, d8

class TextRecord(object): # {{{

    def __init__(self, idx, record, extra_data_flags, decompress):
        self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags)
        raw_trailing_bytes = record.raw[len(self.raw):]
        self.raw = decompress(self.raw)

        if 0 in self.trailing_data:
            self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0)
        if 1 in self.trailing_data:
            self.trailing_data['indexing'] = self.trailing_data.pop(1)
        if 2 in self.trailing_data:
            self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2)
        self.trailing_data['raw_bytes'] = raw_trailing_bytes

        for typ, val in self.trailing_data.iteritems():
            if isinstance(typ, int):
                print ('Record %d has unknown trailing data of type: %d : %r'%
                        (idx, typ, val))

        self.idx = idx

    def dump(self, folder):
        name = '%06d'%self.idx
        with open(os.path.join(folder, name+'.txt'), 'wb') as f:
            f.write(self.raw)
        with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f:
            for k, v in self.trailing_data.iteritems():
                raw = '%s : %r\n\n'%(k, v)
                f.write(raw.encode('utf-8'))

# }}}
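
For orientation, a tiny self-contained check of the PalmDB field layout that PalmDB.__init__ above decodes: the type/creator ident lives at bytes 60..68 and the record count is a big-endian unsigned short at bytes 76..78 (the values here are synthetic):

import struct

header = bytearray(78)          # a PalmDB header is 78 bytes
header[60:68] = b'BOOKMOBI'     # type (4 bytes) + creator (4 bytes)
header[76:78] = struct.pack(b'>H', 42)

ident = bytes(header[60:68])
number_of_records, = struct.unpack(b'>H', bytes(header[76:78]))
assert (ident, number_of_records) == (b'BOOKMOBI', 42)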
src/calibre/ebooks/mobi/debug/main.py (Normal file, 48 lines)
@@ -0,0 +1,48 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import sys, os, shutil

from calibre.ebooks.mobi.debug.headers import MOBIFile
from calibre.ebooks.mobi.debug.mobi6 import inspect_mobi as inspect_mobi6
from calibre.ebooks.mobi.debug.mobi8 import inspect_mobi as inspect_mobi8

def inspect_mobi(path_or_stream, ddir=None): # {{{
    stream = (path_or_stream if hasattr(path_or_stream, 'read') else
            open(path_or_stream, 'rb'))
    f = MOBIFile(stream)
    if ddir is None:
        ddir = 'decompiled_' + os.path.splitext(os.path.basename(stream.name))[0]
    try:
        shutil.rmtree(ddir)
    except:
        pass
    os.makedirs(ddir)
    if f.kf8_type is None:
        inspect_mobi6(f, ddir)
    elif f.kf8_type == 'joint':
        p6 = os.path.join(ddir, 'mobi6')
        os.mkdir(p6)
        inspect_mobi6(f, p6)
        p8 = os.path.join(ddir, 'mobi8')
        os.mkdir(p8)
        inspect_mobi8(f, p8)
    else:
        inspect_mobi8(f, ddir)

    print ('Debug data saved to:', ddir)

# }}}

def main():
    inspect_mobi(sys.argv[1])

if __name__ == '__main__':
    main()
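
Usage sketch, assuming a calibre development environment on sys.path:

from calibre.ebooks.mobi.debug.main import inspect_mobi

inspect_mobi('book.mobi')
# For a joint MOBI/KF8 file this creates decompiled_book/mobi6/ and
# decompiled_book/mobi8/ (per the branching above); for a plain MOBI
# everything lands in decompiled_book/ directly.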
src/calibre/ebooks/mobi/debug/mobi6.py (Normal file, 839 lines)
@@ -0,0 +1,839 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import struct, sys, os
from collections import OrderedDict, defaultdict

from lxml import html

from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import (parse_index_record,
        parse_tagx_section)
from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
        decode_tbs, read_font_record)
from calibre.utils.magick.draw import identify_data
from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.debug.headers import TextRecord


class TagX(object): # {{{

    def __init__(self, tag, num_values, bitmask, eof):
        self.tag, self.num_values, self.bitmask, self.eof = (tag, num_values,
                bitmask, eof)
        self.num_of_values = num_values
        self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0
                and self.bitmask == 0)

    def __repr__(self):
        return 'TAGX(tag=%02d, num_values=%d, bitmask=%r, eof=%d)' % (self.tag,
                self.num_values, bin(self.bitmask), self.eof)
# }}}

class SecondaryIndexHeader(object): # {{{

    def __init__(self, record):
        self.record = record
        raw = self.record.raw
        #open('/t/index_header.bin', 'wb').write(raw)
        if raw[:4] != b'INDX':
            raise ValueError('Invalid Secondary Index Record')
        self.header_length, = struct.unpack('>I', raw[4:8])
        self.unknown1 = raw[8:16]
        self.index_type, = struct.unpack('>I', raw[16:20])
        self.index_type_desc = {0: 'normal', 2:
                'inflection', 6: 'calibre'}.get(self.index_type, 'unknown')
        self.idxt_start, = struct.unpack('>I', raw[20:24])
        self.index_count, = struct.unpack('>I', raw[24:28])
        self.index_encoding_num, = struct.unpack('>I', raw[28:32])
        self.index_encoding = {65001: 'utf-8', 1252:
                'cp1252'}.get(self.index_encoding_num, 'unknown')
        if self.index_encoding == 'unknown':
            raise ValueError(
                'Unknown index encoding: %d'%self.index_encoding_num)
        self.unknown2 = raw[32:36]
        self.num_index_entries, = struct.unpack('>I', raw[36:40])
        self.ordt_start, = struct.unpack('>I', raw[40:44])
        self.ligt_start, = struct.unpack('>I', raw[44:48])
        self.num_of_ligt_entries, = struct.unpack('>I', raw[48:52])
        self.num_of_cncx_blocks, = struct.unpack('>I', raw[52:56])
        self.unknown3 = raw[56:180]
        self.tagx_offset, = struct.unpack(b'>I', raw[180:184])
        if self.tagx_offset != self.header_length:
            raise ValueError('TAGX offset and header length disagree')
        self.unknown4 = raw[184:self.header_length]

        tagx = raw[self.header_length:]
        if not tagx.startswith(b'TAGX'):
            raise ValueError('Invalid TAGX section')
        self.tagx_header_length, = struct.unpack('>I', tagx[4:8])
        self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12])
        self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]]
        if self.tagx_entries and not self.tagx_entries[-1].is_eof:
            raise ValueError('TAGX last entry is not EOF')

        idxt0_pos = self.header_length+self.tagx_header_length
        num = ord(raw[idxt0_pos])
        count_pos = idxt0_pos+1+num
        self.last_entry = raw[idxt0_pos+1:count_pos]
        self.ncx_count, = struct.unpack(b'>H', raw[count_pos:count_pos+2])

        # There may be some alignment zero bytes between the end of the idxt0
        # and self.idxt_start
        idxt = raw[self.idxt_start:]
        if idxt[:4] != b'IDXT':
            raise ValueError('Invalid IDXT header')
        length_check, = struct.unpack(b'>H', idxt[4:6])
        if length_check != self.header_length + self.tagx_header_length:
            raise ValueError('Length check failed')
        if idxt[6:].replace(b'\0', b''):
            raise ValueError('Non null trailing bytes after IDXT')


    def __str__(self):
        ans = ['*'*20 + ' Secondary Index Header '+ '*'*20]
        a = ans.append
        def u(w):
            a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
                len(w), not bool(w.replace(b'\0', b'')) ))

        a('Header length: %d'%self.header_length)
        u(self.unknown1)
        a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type))
        a('Offset to IDXT start: %d'%self.idxt_start)
        a('Number of index records: %d'%self.index_count)
        a('Index encoding: %s (%d)'%(self.index_encoding,
            self.index_encoding_num))
        u(self.unknown2)
        a('Number of index entries: %d'% self.num_index_entries)
        a('ORDT start: %d'%self.ordt_start)
        a('LIGT start: %d'%self.ligt_start)
        a('Number of LIGT entries: %d'%self.num_of_ligt_entries)
        a('Number of cncx blocks: %d'%self.num_of_cncx_blocks)
        u(self.unknown3)
        a('TAGX offset: %d'%self.tagx_offset)
        u(self.unknown4)
        a('\n\n')
        a('*'*20 + ' TAGX Header (%d bytes)'%self.tagx_header_length+ '*'*20)
        a('Header length: %d'%self.tagx_header_length)
        a('Control byte count: %d'%self.tagx_control_byte_count)
        for i in self.tagx_entries:
            a('\t' + repr(i))
        a('Index of last IndexEntry in secondary index record: %s'% self.last_entry)
        a('Number of entries in the NCX: %d'% self.ncx_count)

        return '\n'.join(ans)

# }}}

class IndexHeader(object): # {{{

    def __init__(self, record):
        self.record = record
        raw = self.record.raw
        #open('/t/index_header.bin', 'wb').write(raw)
        if raw[:4] != b'INDX':
            raise ValueError('Invalid Primary Index Record')

        self.header_length, = struct.unpack('>I', raw[4:8])
        self.unknown1 = raw[8:12]
        self.header_type, = struct.unpack('>I', raw[12:16])
        self.index_type, = struct.unpack('>I', raw[16:20])
        self.index_type_desc = {0: 'normal', 2:
                'inflection', 6: 'calibre'}.get(self.index_type, 'unknown')
        self.idxt_start, = struct.unpack('>I', raw[20:24])
        self.index_count, = struct.unpack('>I', raw[24:28])
        self.index_encoding_num, = struct.unpack('>I', raw[28:32])
        self.index_encoding = {65001: 'utf-8', 1252:
                'cp1252'}.get(self.index_encoding_num, 'unknown')
        if self.index_encoding == 'unknown':
            raise ValueError(
                'Unknown index encoding: %d'%self.index_encoding_num)
        self.possibly_language = raw[32:36]
        self.num_index_entries, = struct.unpack('>I', raw[36:40])
        self.ordt_start, = struct.unpack('>I', raw[40:44])
        self.ligt_start, = struct.unpack('>I', raw[44:48])
        self.num_of_ligt_entries, = struct.unpack('>I', raw[48:52])
        self.num_of_cncx_blocks, = struct.unpack('>I', raw[52:56])
        self.unknown2 = raw[56:180]
        self.tagx_offset, = struct.unpack(b'>I', raw[180:184])
        if self.tagx_offset != self.header_length:
            raise ValueError('TAGX offset and header length disagree')
        self.unknown3 = raw[184:self.header_length]

        tagx = raw[self.header_length:]
        if not tagx.startswith(b'TAGX'):
            raise ValueError('Invalid TAGX section')
        self.tagx_header_length, = struct.unpack('>I', tagx[4:8])
        self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12])
        self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]]
        if self.tagx_entries and not self.tagx_entries[-1].is_eof:
            raise ValueError('TAGX last entry is not EOF')

        idxt0_pos = self.header_length+self.tagx_header_length
        last_num, consumed = decode_hex_number(raw[idxt0_pos:])
        count_pos = idxt0_pos + consumed
        self.ncx_count, = struct.unpack(b'>H', raw[count_pos:count_pos+2])
        self.last_entry = last_num

        if last_num != self.ncx_count - 1:
            raise ValueError('Last id number in the NCX != NCX count - 1')
        # There may be some alignment zero bytes between the end of the idxt0
        # and self.idxt_start

        idxt = raw[self.idxt_start:]
        if idxt[:4] != b'IDXT':
            raise ValueError('Invalid IDXT header')
        length_check, = struct.unpack(b'>H', idxt[4:6])
        if length_check != self.header_length + self.tagx_header_length:
            raise ValueError('Length check failed')
        if idxt[6:].replace(b'\0', b''):
            raise ValueError('Non null trailing bytes after IDXT')


    def __str__(self):
        ans = ['*'*20 + ' Index Header (%d bytes)'%len(self.record.raw)+ '*'*20]
        a = ans.append
        def u(w):
            a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
                len(w), not bool(w.replace(b'\0', b'')) ))

        a('Header length: %d'%self.header_length)
        u(self.unknown1)
        a('Header type: %d'%self.header_type)
        a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type))
        a('Offset to IDXT start: %d'%self.idxt_start)
        a('Number of index records: %d'%self.index_count)
        a('Index encoding: %s (%d)'%(self.index_encoding,
            self.index_encoding_num))
        a('Unknown (possibly language?): %r'%(self.possibly_language))
        a('Number of index entries: %d'% self.num_index_entries)
        a('ORDT start: %d'%self.ordt_start)
        a('LIGT start: %d'%self.ligt_start)
        a('Number of LIGT entries: %d'%self.num_of_ligt_entries)
        a('Number of cncx blocks: %d'%self.num_of_cncx_blocks)
        u(self.unknown2)
        a('TAGX offset: %d'%self.tagx_offset)
        u(self.unknown3)
        a('\n\n')
        a('*'*20 + ' TAGX Header (%d bytes)'%self.tagx_header_length+ '*'*20)
        a('Header length: %d'%self.tagx_header_length)
        a('Control byte count: %d'%self.tagx_control_byte_count)
        for i in self.tagx_entries:
            a('\t' + repr(i))
        a('Index of last IndexEntry in primary index record: %s'% self.last_entry)
        a('Number of entries in the NCX: %d'% self.ncx_count)

        return '\n'.join(ans)
# }}}

class Tag(object): # {{{

    '''
    Index entries are a collection of tags. Each tag is represented by this
    class.
    '''

    TAG_MAP = {
            1: ('offset', 'Offset in HTML'),
            2: ('size', 'Size in HTML'),
            3: ('label_offset', 'Label offset in CNCX'),
            4: ('depth', 'Depth of this entry in TOC'),
            5: ('class_offset', 'Class offset in CNCX'),
            6: ('pos_fid', 'File Index'),

            11: ('secondary', '[unknown, unknown, '
                'tag type from TAGX in primary index header]'),

            21: ('parent_index', 'Parent'),
            22: ('first_child_index', 'First child'),
            23: ('last_child_index', 'Last child'),

            69 : ('image_index', 'Offset from first image record to the'
                ' image record associated with this entry'
                ' (masthead for periodical or thumbnail for'
                ' article entry).'),
            70 : ('desc_offset', 'Description offset in cncx'),
            71 : ('author_offset', 'Author offset in cncx'),
            72 : ('image_caption_offset', 'Image caption offset in cncx'),
            73 : ('image_attr_offset', 'Image attribution offset in cncx'),

    }

    def __init__(self, tag_type, vals, cncx):
        self.value = vals if len(vals) > 1 else vals[0] if vals else None

        self.cncx_value = None
        if tag_type in self.TAG_MAP:
            self.attr, self.desc = self.TAG_MAP[tag_type]
        else:
            print ('Unknown tag value: %%s'%tag_type)
            self.desc = '??Unknown (tag value: %d)'%tag_type
            self.attr = 'unknown'

        if '_offset' in self.attr:
            self.cncx_value = cncx[self.value]

    def __str__(self):
        if self.cncx_value is not None:
            return '%s : %r [%r]'%(self.desc, self.value, self.cncx_value)
        return '%s : %r'%(self.desc, self.value)

# }}}

class IndexEntry(object): # {{{

    '''
    The index is made up of entries, each of which is represented by an
    instance of this class. Index entries typically point to offsets in the
    HTML, specify HTML sizes and point to text strings in the CNCX that are
    used in the navigation UI.
    '''

    def __init__(self, ident, entry, cncx):
        try:
            self.index = int(ident, 16)
        except ValueError:
            self.index = ident
        self.tags = [Tag(tag_type, vals, cncx) for tag_type, vals in
                entry.iteritems()]

    @property
    def label(self):
        for tag in self.tags:
            if tag.attr == 'label_offset':
                return tag.cncx_value
        return ''

    @property
    def offset(self):
        for tag in self.tags:
            if tag.attr == 'offset':
                return tag.value
        return 0

    @property
    def size(self):
        for tag in self.tags:
            if tag.attr == 'size':
                return tag.value
        return 0

    @property
    def depth(self):
        for tag in self.tags:
            if tag.attr == 'depth':
                return tag.value
        return 0

    @property
    def parent_index(self):
        for tag in self.tags:
            if tag.attr == 'parent_index':
                return tag.value
        return -1

    @property
    def first_child_index(self):
        for tag in self.tags:
            if tag.attr == 'first_child_index':
                return tag.value
        return -1

    @property
    def last_child_index(self):
        for tag in self.tags:
            if tag.attr == 'last_child_index':
                return tag.value
        return -1

    @property
    def pos_fid(self):
        for tag in self.tags:
            if tag.attr == 'pos_fid':
                return tag.value
        return [0, 0]

    def __str__(self):
        ans = ['Index Entry(index=%s, length=%d)'%(
            self.index, len(self.tags))]
        for tag in self.tags:
            if tag.value is not None:
                ans.append('\t'+str(tag))
        if self.first_child_index != -1:
            ans.append('\tNumber of children: %d'%(self.last_child_index -
                self.first_child_index + 1))
        return '\n'.join(ans)

# }}}

class IndexRecord(object): # {{{

    '''
    Represents all indexing information in the MOBI, apart from indexing info
    in the trailing data of the text records.
    '''

    def __init__(self, records, index_header, cncx):
        self.alltext = None
        table = OrderedDict()
        tags = [TagX(x.tag, x.num_values, x.bitmask, x.eof) for x in
                index_header.tagx_entries]
        for record in records:
            raw = record.raw

            if raw[:4] != b'INDX':
                raise ValueError('Invalid Primary Index Record')

            parse_index_record(table, record.raw,
                    index_header.tagx_control_byte_count, tags,
                    index_header.index_encoding, strict=True)

        self.indices = []

        for ident, entry in table.iteritems():
            self.indices.append(IndexEntry(ident, entry, cncx))

    def get_parent(self, index):
        if index.depth < 1:
            return None
        parent_depth = index.depth - 1
        for p in self.indices:
            if p.depth != parent_depth:
                continue

    def __str__(self):
        ans = ['*'*20 + ' Index Entries (%d entries) '%len(self.indices)+ '*'*20]
        a = ans.append
        def u(w):
            a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
                len(w), not bool(w.replace(b'\0', b'')) ))
        for entry in self.indices:
            offset = entry.offset
            a(str(entry))
            t = self.alltext
            if offset is not None and self.alltext is not None:
                a('\tHTML before offset: %r'%t[offset-50:offset])
                a('\tHTML after offset: %r'%t[offset:offset+50])
                p = offset+entry.size
                a('\tHTML before end: %r'%t[p-50:p])
                a('\tHTML after end: %r'%t[p:p+50])

            a('')

        return '\n'.join(ans)

# }}}

class CNCX(object): # {{{
|
||||
|
||||
'''
|
||||
Parses the records that contain the compiled NCX (all strings from the
|
||||
NCX). Presents a simple offset : string mapping interface to access the
|
||||
data.
|
||||
'''
|
||||
|
||||
def __init__(self, records, codec):
|
||||
self.records = OrderedDict()
|
||||
record_offset = 0
|
||||
for record in records:
|
||||
raw = record.raw
|
||||
pos = 0
|
||||
while pos < len(raw):
|
||||
length, consumed = decint(raw[pos:])
|
||||
if length > 0:
|
||||
try:
|
||||
self.records[pos+record_offset] = raw[
|
||||
pos+consumed:pos+consumed+length].decode(codec)
|
||||
except:
|
||||
byts = raw[pos:]
|
||||
r = format_bytes(byts)
|
||||
print ('CNCX entry at offset %d has unknown format %s'%(
|
||||
pos+record_offset, r))
|
||||
self.records[pos+record_offset] = r
|
||||
pos = len(raw)
|
||||
pos += consumed+length
|
||||
record_offset += 0x10000
|
||||
|
||||
def __getitem__(self, offset):
|
||||
return self.records.get(offset)
|
||||
|
||||
def __str__(self):
|
||||
ans = ['*'*20 + ' cncx (%d strings) '%len(self.records)+ '*'*20]
|
||||
for k, v in self.records.iteritems():
|
||||
ans.append('%10d : %s'%(k, v))
|
||||
return '\n'.join(ans)
|
||||
|
||||
|
||||
# }}}
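
The while loop in CNCX.__init__ leans on decint() for every string length it reads. As a reference point, here is a minimal, self-contained sketch of decoding a MOBI forward-encoded variable-width integer, with the same (value, consumed) return shape the call sites above assume; it illustrates the encoding and is not the calibre implementation:

    # Seven payload bits per byte, big-endian, with the high bit set
    # on the final byte. Returns (value, bytes_consumed).
    def decode_vwi_forward(raw):
        value = 0
        for consumed, byte in enumerate(bytearray(raw), 1):
            value = (value << 7) | (byte & 0x7F)
            if byte & 0x80: # stop bit: this was the last byte
                return value, consumed
        raise ValueError('No stop bit found in variable-width integer')

    # Example: b'\x04\x8f' decodes to (4 << 7) | 0x0f == 527, consuming 2 bytes.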

class ImageRecord(object): # {{{

    def __init__(self, idx, record, fmt):
        self.raw = record.raw
        self.fmt = fmt
        self.idx = idx

    def dump(self, folder):
        name = '%06d'%self.idx
        with open(os.path.join(folder, name+'.'+self.fmt), 'wb') as f:
            f.write(self.raw)

# }}}

class BinaryRecord(object): # {{{

    def __init__(self, idx, record):
        self.raw = record.raw
        sig = self.raw[:4]
        name = '%06d'%idx
        if sig in {b'FCIS', b'FLIS', b'SRCS', b'DATP', b'RESC', b'BOUN',
                b'FDST', b'AUDI', b'VIDE',}:
            name += '-' + sig.decode('ascii')
        elif sig == b'\xe9\x8e\r\n':
            name += '-' + 'EOF'
        self.name = name

    def dump(self, folder):
        with open(os.path.join(folder, self.name+'.bin'), 'wb') as f:
            f.write(self.raw)

# }}}

class FontRecord(object): # {{{

    def __init__(self, idx, record):
        self.raw = record.raw
        name = '%06d'%idx
        self.font = read_font_record(self.raw)
        if self.font['err']:
            raise ValueError('Failed to read font record: %s Headers: %s'%(
                self.font['err'], self.font['headers']))
        self.payload = (self.font['font_data'] if self.font['font_data'] else
                self.font['raw_data'])
        self.name = '%s.%s'%(name, self.font['ext'])

    def dump(self, folder):
        with open(os.path.join(folder, self.name), 'wb') as f:
            f.write(self.payload)

# }}}

class TBSIndexing(object): # {{{

    def __init__(self, text_records, indices, doc_type):
        self.record_indices = OrderedDict()
        self.doc_type = doc_type
        self.indices = indices
        pos = 0
        for r in text_records:
            start = pos
            pos += len(r.raw)
            end = pos - 1
            self.record_indices[r] = x = {'starts':[], 'ends':[],
                    'complete':[], 'geom': (start, end)}
            for entry in indices:
                istart, sz = entry.offset, entry.size
                iend = istart + sz - 1
                has_start = istart >= start and istart <= end
                has_end = iend >= start and iend <= end
                rec = None
                if has_start and has_end:
                    rec = 'complete'
                elif has_start and not has_end:
                    rec = 'starts'
                elif not has_start and has_end:
                    rec = 'ends'
                if rec:
                    x[rec].append(entry)

    def get_index(self, idx):
        for i in self.indices:
            if i.index in {idx, unicode(idx)}: return i
        raise IndexError('Index %d not found'%idx)

    def __str__(self):
        ans = ['*'*20 + ' TBS Indexing (%d records) '%len(self.record_indices)+ '*'*20]
        for r, dat in self.record_indices.iteritems():
            ans += self.dump_record(r, dat)[-1]
        return '\n'.join(ans)

    def dump(self, bdir):
        types = defaultdict(list)
        for r, dat in self.record_indices.iteritems():
            tbs_type, strings = self.dump_record(r, dat)
            if tbs_type == 0: continue
            types[tbs_type] += strings
        for typ, strings in types.iteritems():
            with open(os.path.join(bdir, 'tbs_type_%d.txt'%typ), 'wb') as f:
                f.write('\n'.join(strings))

    def dump_record(self, r, dat):
        ans = []
        ans.append('\nRecord #%d: Starts at: %d Ends at: %d'%(r.idx,
            dat['geom'][0], dat['geom'][1]))
        s, e, c = dat['starts'], dat['ends'], dat['complete']
        ans.append(('\tContains: %d index entries '
            '(%d ends, %d complete, %d starts)')%tuple(map(len, (s+e+c, e,
                c, s))))
        byts = bytearray(r.trailing_data.get('indexing', b''))
        ans.append('TBS bytes: %s'%format_bytes(byts))
        for typ, entries in (('Ends', e), ('Complete', c), ('Starts', s)):
            if entries:
                ans.append('\t%s:'%typ)
                for x in entries:
                    ans.append(('\t\tIndex Entry: %s (Parent index: %s, '
                        'Depth: %d, Offset: %d, Size: %d) [%s]')%(
                        x.index, x.parent_index, x.depth, x.offset, x.size, x.label))

        def bin4(num):
            ans = bin(num)[2:]
            return bytes('0'*(4-len(ans)) + ans)

        def repr_extra(x):
            return str({bin4(k):v for k, v in extra.iteritems()})

        tbs_type = 0
        is_periodical = self.doc_type in (257, 258, 259)
        if len(byts):
            outermost_index, extra, consumed = decode_tbs(byts, flag_size=3)
            byts = byts[consumed:]
            for k in extra:
                tbs_type |= k
            ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type)))
            ans.append('Outermost index: %d'%outermost_index)
            ans.append('Unknown extra start bytes: %s'%repr_extra(extra))
            if is_periodical: # Hierarchical periodical
                try:
                    byts, a = self.interpret_periodical(tbs_type, byts,
                            dat['geom'][0])
                except:
                    import traceback
                    traceback.print_exc()
                    a = []
                    print ('Failed to decode TBS bytes for record: %d'%r.idx)
                ans += a
            if byts:
                sbyts = tuple(hex(b)[2:] for b in byts)
                ans.append('Remaining bytes: %s'%' '.join(sbyts))

        ans.append('')
        return tbs_type, ans

    def interpret_periodical(self, tbs_type, byts, record_offset):
        ans = []

        def read_section_transitions(byts, psi=None): # {{{
            if psi is None:
                # Assume previous section is 1
                psi = self.get_index(1)

            while byts:
                ai, extra, consumed = decode_tbs(byts)
                byts = byts[consumed:]
                if extra.get(0b0010, None) is not None:
                    raise ValueError('Dont know how to interpret flag 0b0010'
                            ' while reading section transitions')
                if extra.get(0b1000, None) is not None:
                    if len(extra) > 1:
                        raise ValueError('Dont know how to interpret flags'
                                ' %r while reading section transitions'%extra)
                    nsi = self.get_index(psi.index+1)
                    ans.append('Last article in this record of section %d'
                            ' (relative to next section index [%d]): '
                            '%d [%d absolute index]'%(psi.index, nsi.index, ai,
                                ai+nsi.index))
                    psi = nsi
                    continue

                ans.append('First article in this record of section %d'
                        ' (relative to its parent section): '
                        '%d [%d absolute index]'%(psi.index, ai, ai+psi.index))

                num = extra.get(0b0100, None)
                if num is None:
                    msg = ('The section %d has at most one article'
                            ' in this record')%psi.index
                else:
                    msg = ('Number of articles in this record of '
                            'section %d: %d')%(psi.index, num)
                ans.append(msg)

                offset = extra.get(0b0001, None)
                if offset is not None:
                    if offset == 0:
                        ans.append('This record is spanned by the article:'
                                '%d'%(ai+psi.index))
                    else:
                        ans.append('->Offset to start of next section (%d) from start'
                                ' of record: %d [%d absolute offset]'%(psi.index+1,
                                    offset, offset+record_offset))
            return byts
        # }}}

        def read_starting_section(byts): # {{{
            orig = byts
            si, extra, consumed = decode_tbs(byts)
            byts = byts[consumed:]
            if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra:
                raise ValueError('Dont know how to interpret flags %r'
                        ' when reading starting section'%extra)
            si = self.get_index(si)
            ans.append('The section at the start of this record is:'
                    ' %s'%si.index)
            if 0b0100 in extra:
                num = extra[0b0100]
                ans.append('The number of articles from the section %d'
                        ' in this record: %s'%(si.index, num))
            elif 0b0001 in extra:
                eof = extra[0b0001]
                if eof != 0:
                    raise ValueError('Unknown eof value %s when reading'
                            ' starting section. All bytes: %r'%(eof, orig))
                ans.append('??This record has more than one article from '
                        ' the section: %s'%si.index)
            return si, byts
        # }}}

        if tbs_type & 0b0100:
            # Starting section is the first section
            ssi = self.get_index(1)
        else:
            ssi, byts = read_starting_section(byts)

        byts = read_section_transitions(byts, ssi)

        return byts, ans

# }}}

class MOBIFile(object): # {{{

    def __init__(self, mf):
        for x in ('raw', 'palmdb', 'record_headers', 'records', 'mobi_header',
                'huffman_record_nums',):
            setattr(self, x, getattr(mf, x))

        self.index_header = self.index_record = None
        self.indexing_record_nums = set()
        pir = self.mobi_header.primary_index_record
        if pir != NULL_INDEX:
            self.index_header = IndexHeader(self.records[pir])
            numi = self.index_header.index_count
            self.cncx = CNCX(self.records[
                pir+1+numi:pir+1+numi+self.index_header.num_of_cncx_blocks],
                self.index_header.index_encoding)
            self.index_record = IndexRecord(self.records[pir+1:pir+1+numi],
                    self.index_header, self.cncx)
            self.indexing_record_nums = set(xrange(pir,
                pir+1+numi+self.index_header.num_of_cncx_blocks))
        self.secondary_index_record = self.secondary_index_header = None
        sir = self.mobi_header.secondary_index_record
        if sir != NULL_INDEX:
            self.secondary_index_header = SecondaryIndexHeader(self.records[sir])
            numi = self.secondary_index_header.index_count
            self.indexing_record_nums.add(sir)
            self.secondary_index_record = IndexRecord(
                    self.records[sir+1:sir+1+numi], self.secondary_index_header, self.cncx)
            self.indexing_record_nums |= set(xrange(sir+1, sir+1+numi))

        ntr = self.mobi_header.number_of_text_records
        fntbr = self.mobi_header.first_non_book_record
        fii = self.mobi_header.first_image_index
        if fntbr == NULL_INDEX:
            fntbr = len(self.records)
        self.text_records = [TextRecord(r, self.records[r],
            self.mobi_header.extra_data_flags, mf.decompress6) for r in xrange(1,
            min(len(self.records), ntr+1))]
        self.image_records, self.binary_records = [], []
        self.font_records = []
        image_index = 0
        for i in xrange(fntbr, len(self.records)):
            if i in self.indexing_record_nums or i in self.huffman_record_nums:
                continue
            image_index += 1
            r = self.records[i]
            fmt = None
            if i >= fii and r.raw[:4] not in {b'FLIS', b'FCIS', b'SRCS',
                    b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP',
                    b'AUDI', b'VIDE', b'FONT'}:
                try:
                    width, height, fmt = identify_data(r.raw)
                except:
                    pass
            if fmt is not None:
                self.image_records.append(ImageRecord(image_index, r, fmt))
            elif r.raw[:4] == b'FONT':
                self.font_records.append(FontRecord(i, r))
            else:
                self.binary_records.append(BinaryRecord(i, r))

        if self.index_record is not None:
            self.tbs_indexing = TBSIndexing(self.text_records,
                    self.index_record.indices, self.mobi_header.type_raw)

    def print_header(self, f=sys.stdout):
        print (str(self.palmdb).encode('utf-8'), file=f)
        print (file=f)
        print ('Record headers:', file=f)
        for i, r in enumerate(self.records):
            print ('%6d. %s'%(i, r.header), file=f)

        print (file=f)
        print (str(self.mobi_header).encode('utf-8'), file=f)
# }}}

def inspect_mobi(mobi_file, ddir):
    f = MOBIFile(mobi_file)
    with open(os.path.join(ddir, 'header.txt'), 'wb') as out:
        f.print_header(f=out)

    alltext = os.path.join(ddir, 'text.html')
    with open(alltext, 'wb') as of:
        alltext = b''
        for rec in f.text_records:
            of.write(rec.raw)
            alltext += rec.raw
        of.seek(0)

    root = html.fromstring(alltext.decode('utf-8'))
    with open(os.path.join(ddir, 'pretty.html'), 'wb') as of:
        of.write(html.tostring(root, pretty_print=True, encoding='utf-8',
            include_meta_content_type=True))

    if f.index_header is not None:
        f.index_record.alltext = alltext
        with open(os.path.join(ddir, 'index.txt'), 'wb') as out:
            print(str(f.index_header), file=out)
            print('\n\n', file=out)
            if f.secondary_index_header is not None:
                print(str(f.secondary_index_header).encode('utf-8'), file=out)
                print('\n\n', file=out)
            if f.secondary_index_record is not None:
                print(str(f.secondary_index_record).encode('utf-8'), file=out)
                print('\n\n', file=out)
            print(str(f.cncx).encode('utf-8'), file=out)
            print('\n\n', file=out)
            print(str(f.index_record), file=out)
        with open(os.path.join(ddir, 'tbs_indexing.txt'), 'wb') as out:
            print(str(f.tbs_indexing), file=out)
        f.tbs_indexing.dump(ddir)

    for tdir, attr in [('text', 'text_records'), ('images', 'image_records'),
            ('binary', 'binary_records'), ('font', 'font_records')]:
        tdir = os.path.join(ddir, tdir)
        os.mkdir(tdir)
        for rec in getattr(f, attr):
            rec.dump(tdir)

# }}}
62
src/calibre/ebooks/mobi/debug/mobi8.py
Normal file
@ -0,0 +1,62 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__   = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import sys, os

from calibre.ebooks.mobi.debug.headers import TextRecord

class MOBIFile(object):

    def __init__(self, mf):
        self.mf = mf
        h, h8 = mf.mobi_header, mf.mobi8_header
        first_text_record = 1
        offset = 0
        res_end = len(mf.records)
        if mf.kf8_type == 'joint':
            offset = h.exth.kf8_header_index
            res_end = offset - 1

        self.resource_records = mf.records[h.first_non_book_record:res_end]
        self.text_records = [TextRecord(i, r, h8.extra_data_flags,
            mf.decompress8) for i, r in
            enumerate(mf.records[first_text_record+offset:
                first_text_record+offset+h8.number_of_text_records])]

        self.raw_text = b''.join(r.raw for r in self.text_records)

    def print_header(self, f=sys.stdout):
        print (str(self.mf.palmdb).encode('utf-8'), file=f)
        print (file=f)
        print ('Record headers:', file=f)
        for i, r in enumerate(self.mf.records):
            print ('%6d. %s'%(i, r.header), file=f)

        print (file=f)
        print (str(self.mf.mobi8_header).encode('utf-8'), file=f)


def inspect_mobi(mobi_file, ddir):
    f = MOBIFile(mobi_file)
    with open(os.path.join(ddir, 'header.txt'), 'wb') as out:
        f.print_header(f=out)

    alltext = os.path.join(ddir, 'raw_text.html')
    with open(alltext, 'wb') as of:
        of.write(f.raw_text)

    for tdir, attr in [('text_records', 'text_records'), ('images',
        'image_records'), ('binary', 'binary_records'), ('font',
            'font_records')]:
        tdir = os.path.join(ddir, tdir)
        os.mkdir(tdir)
        for rec in getattr(f, attr, []):
            rec.dump(tdir)
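
For reference, the usual way into this dumper is the calibre-debug tool rather than importing the module directly; the flag name below is assumed from the calibre CLI of this era:

    calibre-debug --inspect-mobi book.azw3
    # dumps header.txt, raw_text.html and the text_records/, images/,
    # binary/ and font/ folders for the KF8 part into an output directory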

@ -186,20 +186,16 @@ class BookHeader(object):
        if len(raw) >= 0xF8:
            self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)

        if self.mobi_version >= 8:
            self.skelidx, = struct.unpack_from('>L', raw, 0xFC)

            # Index into <div> sections in raw_ml
            self.dividx, = struct.unpack_from('>L', raw, 0xF8)

            # Index into Other files
            self.othidx, = struct.unpack_from('>L', raw, 0x104)
        # Ancient PRC files from Baen can have random values for
        # mobi_version, so be conservative
        if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
            self.dividx, self.skelidx, self.datpidx, self.othidx = \
                    struct.unpack_from(b'>4L', raw, 0xF8)

            # need to use the FDST record to find out how to properly
            # unpack the raw_ml into pieces it is simply a table of start
            # and end locations for each flow piece
            self.fdstidx, = struct.unpack_from('>L', raw, 0xC0)
            self.fdstcnt, = struct.unpack_from('>L', raw, 0xC4)
            self.fdstidx, self.fdstcnt = struct.unpack_from(b'>2L', raw, 0xC0)
            # if cnt is 1 or less, fdst section number can be garbage
            if self.fdstcnt <= 1:
                self.fdstidx = NULL_INDEX
@ -8,9 +8,13 @@ __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import struct
from collections import OrderedDict
from collections import OrderedDict, namedtuple

from calibre.ebooks.mobi.utils import decint, count_set_bits
from calibre.ebooks.mobi.utils import (decint, count_set_bits,
        decode_string)

TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')

class InvalidFile(ValueError):
    pass

@ -37,9 +41,8 @@ def parse_indx_header(data):
            'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
    )
    num = len(words)
    values = struct.unpack(b'>%dL' % num, data[4:4*(num+1)])
    header = {words[i]:values[i] for i in xrange(num)}
    return header
    values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
    return dict(zip(words, values))

class CNCX(object): # {{{
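
As a self-contained illustration of the new unpack-and-zip pattern in parse_indx_header, with made-up field names and data (nothing below comes from a real INDX header):

    import struct

    words = ('len', 'nul1')
    data = b'INDX' + struct.pack(b'>2L', 192, 0)
    num = len(words)
    values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
    assert dict(zip(words, values)) == {'len': 192, 'nul1': 0}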

@ -77,81 +80,116 @@ class CNCX(object): # {{{
        return self.records.get(offset, default)
# }}}

def parse_tag_section(data):
def parse_tagx_section(data):
    check_signature(data, b'TAGX')

    tags = []
    first_entry_offset, = struct.unpack_from(b'>L', data, 0x04)
    control_byte_count, = struct.unpack_from(b'>L', data, 0x08)
    first_entry_offset, = struct.unpack_from(b'>L', data, 4)
    control_byte_count, = struct.unpack_from(b'>L', data, 8)

    # Skip the first 12 bytes already read above.
    for i in xrange(12, first_entry_offset, 4):
        pos = i
        tags.append((ord(data[pos]), ord(data[pos+1]), ord(data[pos+2]),
            ord(data[pos+3])))
        vals = list(bytearray(data[i:i+4]))
        tags.append(TagX(*vals))
    return control_byte_count, tags
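
Each TAGX entry is exactly four bytes, which is why the rewrite can read it through bytearray instead of four ord() calls. A worked example with an invented entry:

    from collections import namedtuple
    TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')

    # b'\x02\x01\x02\x00' -> tag 2, one value per entry, bitmask 0x02, not EOF
    assert TagX(*bytearray(b'\x02\x01\x02\x00')) == TagX(2, 1, 2, 0)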

def get_tag_map(control_byte_count, tags, data, start, end):
def get_tag_map(control_byte_count, tagx, data, strict=False):
    ptags = []
    ans = {}
    control_byte_index = 0
    data_start = start + control_byte_count
    control_bytes = list(bytearray(data[:control_byte_count]))
    data = data[control_byte_count:]

    for tag, values_per_entry, mask, end_flag in tags:
        if end_flag == 0x01:
            control_byte_index += 1
    for x in tagx:
        if x.eof == 0x01:
            control_bytes = control_bytes[1:]
            continue
        value = ord(data[start + control_byte_index]) & mask
        value = control_bytes[0] & x.bitmask
        if value != 0:
            if value == mask:
                if count_set_bits(mask) > 1:
            value_count = value_bytes = None
            if value == x.bitmask:
                if count_set_bits(x.bitmask) > 1:
                    # If all bits of masked value are set and the mask has more
                    # than one bit, a variable width value will follow after
                    # the control bytes which defines the length of bytes (NOT
                    # the value count!) which will contain the corresponding
                    # variable width values.
                    value, consumed = decint(data[data_start:])
                    data_start += consumed
                    ptags.append((tag, None, value, values_per_entry))
                    value_bytes, consumed = decint(data)
                    data = data[consumed:]
                else:
                    ptags.append((tag, 1, None, values_per_entry))
                    value_count = 1
            else:
                # Shift bits to get the masked value.
                while mask & 0x01 == 0:
                    mask = mask >> 1
                    value = value >> 1
                ptags.append((tag, value, None, values_per_entry))
    for tag, value_count, value_bytes, values_per_entry in ptags:
                mask = x.bitmask
                while mask & 0b1 == 0:
                    mask >>= 1
                    value >>= 1
                value_count = value
            ptags.append(PTagX(x.tag, value_count, value_bytes,
                x.num_of_values))

    for x in ptags:
        values = []
        if value_count != None:
        if x.value_count is not None:
            # Read value_count * values_per_entry variable width values.
            for _ in xrange(value_count*values_per_entry):
                byts, consumed = decint(data[data_start:])
                data_start += consumed
            for _ in xrange(x.value_count * x.num_of_values):
                byts, consumed = decint(data)
                data = data[consumed:]
                values.append(byts)
        else:
        else: # value_bytes is not None
            # Convert value_bytes to variable width values.
            total_consumed = 0
            while total_consumed < value_bytes:
            while total_consumed < x.value_bytes:
                # Does this work for values_per_entry != 1?
                byts, consumed = decint(data[data_start:])
                data_start += consumed
                byts, consumed = decint(data)
                data = data[consumed:]
                total_consumed += consumed
                values.append(byts)
            if total_consumed != value_bytes:
                print ("Error: Should consume %s bytes, but consumed %s" %
                    (value_bytes, total_consumed))
        ans[tag] = values
    # Test that all bytes have been processed if end is given.
    if end is not None and data_start < end:
        # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
        rest = data[data_start:end]
        if rest.replace(b'\0', b''):
            print ("Warning: There are unprocessed index bytes left: %s" %
                format_bytes(rest))
            if total_consumed != x.value_bytes:
                err = ("Error: Should consume %s bytes, but consumed %s" %
                    (x.value_bytes, total_consumed))
                if strict:
                    raise ValueError(err)
                else:
                    print(err)
        ans[x.tag] = values
    # Test that all bytes have been processed
    if data.replace(b'\0', b''):
        err = ("Warning: There are unprocessed index bytes left: %s" %
            format_bytes(data))
        if strict:
            raise ValueError(err)
        else:
            print(err)

    return ans
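
The shift loop above normalizes a masked control-byte value down to a count. A worked example with assumed numbers:

    # With bitmask 0b1100 and a control byte of 0b0100, the masked value
    # is 0b0100; shifting mask and value right together until the mask is
    # odd leaves a value_count of 1 for that tag.
    mask, value = 0b1100, 0b0100
    while mask & 0b1 == 0:
        mask >>= 1
        value >>= 1
    assert value == 1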

def parse_index_record(table, data, control_byte_count, tags, codec,
        strict=False):
    header = parse_indx_header(data)
    idxt_pos = header['start']
    if data[idxt_pos:idxt_pos+4] != b'IDXT':
        print ('WARNING: Invalid INDX record')
    entry_count = header['count']

    # loop through to build up the IDXT position starts
    idx_positions = []
    for j in xrange(entry_count):
        pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j))
        idx_positions.append(pos)
    # The last entry ends before the IDXT tag (but there might be zero fill
    # bytes we need to ignore!)
    idx_positions.append(idxt_pos)

    # For each entry in the IDXT build up the tag map and any associated
    # text
    for j in xrange(entry_count):
        start, end = idx_positions[j:j+2]
        rec = data[start:end]
        ident, consumed = decode_string(rec, codec=codec)
        rec = rec[consumed:]
        tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
        table[ident] = tag_map


def read_index(sections, idx, codec):
    table, cncx = OrderedDict(), CNCX([], codec)

@ -166,32 +204,11 @@ def read_index(sections, idx, codec):
        cncx = CNCX(cncx_records, codec)

    tag_section_start = indx_header['len']
    control_byte_count, tags = parse_tag_section(data[tag_section_start:])
    control_byte_count, tags = parse_tagx_section(data[tag_section_start:])

    for i in xrange(idx + 1, idx + 1 + indx_count):
        # Index record
        data = sections[i][0]
        header = parse_indx_header(data)
        idxt_pos = header['start']
        entry_count = header['count']

        # loop through to build up the IDXT position starts
        idx_positions = []
        for j in xrange(entry_count):
            pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j))
            idx_positions.append(pos)
        # The last entry ends before the IDXT tag (but there might be zero fill
        # bytes we need to ignore!)
        idx_positions.append(idxt_pos)

        # For each entry in the IDXT build up the tag map and any associated
        # text
        for j in xrange(entry_count):
            start, end = idx_positions[j:j+2]
            text_length = ord(data[start])
            text = data[start+1:start+1+text_length]
            tag_map = get_tag_map(control_byte_count, tags, data,
                    start+1+text_length, end)
            table[text] = tag_map

        parse_index_record(table, data, control_byte_count, tags, codec)
    return table, cncx


@ -33,9 +33,11 @@ def update_internal_links(mobi8_reader):
            for m in posfid_index_pattern.finditer(tag):
                posfid = m.group(1)
                offset = m.group(2)
                filename, idtag = mr.get_id_tag_by_pos_fid(posfid, offset)
                filename, idtag = mr.get_id_tag_by_pos_fid(int(posfid, 32),
                        int(offset, 32))
                suffix = (b'#' + idtag) if idtag else b''
                replacement = filename.encode(mr.header.codec) + suffix
                replacement = filename.split('/')[-1].encode(
                        mr.header.codec) + suffix
                tag = posfid_index_pattern.sub(replacement, tag, 1)
            srcpieces[j] = tag
        part = ''.join([x.decode(mr.header.codec) for x in srcpieces])
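
The pos:fid and offset fields of a kindle:pos:fid URL are base-32 strings, which is why the call site now converts them with int(x, 32) before indexing into the elems table:

    # Python's int() handles base 32 directly; digits run 0-9 then A-V.
    assert int('0001', 32) == 1
    assert int('000A', 32) == 10
    # calibre's to_base() helper (used in ncx.py) goes the other way,
    # e.g. to_base(10, base=32) == 'A' -- signature assumed from its use there.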

@ -107,7 +107,10 @@ class MobiReader(object):
        self.kf8_type = None
        k8i = getattr(self.book_header.exth, 'kf8_header', None)

        if self.book_header.mobi_version == 8:
        # Ancient PRC files from Baen can have random values for
        # mobi_version, so be conservative
        if (self.book_header.mobi_version == 8 and hasattr(self.book_header,
            'skelidx')):
            self.kf8_type = 'standalone'
        elif k8i is not None: # Check for joint mobi 6 and kf 8 file
            try:

@ -118,12 +121,17 @@ class MobiReader(object):
                try:
                    self.book_header = BookHeader(self.sections[k8i][0],
                            self.ident, user_encoding, self.log)
                    # The following are only correct in the Mobi 6
                    # header not the Mobi 8 header

                    # Only the first_image_index from the MOBI 6 header is
                    # useful
                    for x in ('first_image_index',):
                        setattr(self.book_header, x, getattr(bh, x))

                    # We need to do this because the MOBI 6 text extract code
                    # does not know anything about the kf8 offset
                    if hasattr(self.book_header, 'huff_offset'):
                        self.book_header.huff_offset += k8i

                    self.kf8_type = 'joint'
                    self.kf8_boundary = k8i-1
                except:

@ -33,6 +33,7 @@ class Mobi8Reader(object):
    def __init__(self, mobi6_reader, log):
        self.mobi6_reader, self.log = mobi6_reader, log
        self.header = mobi6_reader.book_header
        self.encrypted_fonts = []

    def __call__(self):
        self.mobi6_reader.check_for_drm()

@ -229,11 +230,9 @@ class Mobi8Reader(object):

    def get_id_tag_by_pos_fid(self, posfid, offset):
        # first convert kindle:pos:fid and offset info to position in file
        row = int(posfid, 32)
        off = int(offset, 32)
        [insertpos, idtext, filenum, seqnm, startpos, length] = self.elems[row]
        pos = insertpos + off
        fname = self.get_file_info(pos).filename
        insertpos, idtext, filenum, seqnm, startpos, length = self.elems[posfid]
        pos = insertpos + offset
        fi = self.get_file_info(pos)
        # an existing "id=" must exist in original xhtml otherwise it would not
        # have worked for linking. Amazon seems to have added its own
        # additional "aid=" inside tags whose contents seem to represent some

@ -242,7 +241,7 @@ class Mobi8Reader(object):
        # so find the closest "id=" before position the file by actually
        # searching in that file
        idtext = self.get_id_tag(pos)
        return fname, idtext
        return '%s/%s'%(fi.type, fi.filename), idtext

    def get_id_tag(self, pos):
        # find the correct tag by actually searching in the destination

@ -253,12 +252,13 @@ class Mobi8Reader(object):
        textblock = self.parts[fi.num]
        id_map = []
        npos = pos - fi.start
        # if npos inside a tag then search all text before the its end of tag
        # marker
        pgt = textblock.find(b'>', npos)
        plt = textblock.find(b'<', npos)
        if pgt < plt:
        # if npos inside a tag then search all text before the its end of tag marker
        # else not in a tag need to search the preceding tag
        if plt == npos or pgt < plt:
            npos = pgt + 1
        textblock = textblock[0:npos]
        # find id links only inside of tags
        # inside any < > pair find all "id=' and return whatever is inside
        # the quotes

@ -315,12 +315,18 @@ class Mobi8Reader(object):

        # Add href and anchor info to the index entries
        for entry in index_entries:
            pos = entry['pos']
            fi = self.get_file_info(pos)
            if fi.filename is None:
                raise ValueError('Index entry has invalid pos: %d'%pos)
            idtag = self.get_id_tag(pos).decode(self.header.codec)
            entry['href'] = '%s/%s'%(fi.type, fi.filename)
            pos_fid = entry['pos_fid']
            if pos_fid is None:
                pos = entry['pos']
                fi = self.get_file_info(pos)
                if fi.filename is None:
                    raise ValueError('Index entry has invalid pos: %d'%pos)
                idtag = self.get_id_tag(pos).decode(self.header.codec)
                href = '%s/%s'%(fi.type, fi.filename)
            else:
                href, idtag = self.get_id_tag_by_pos_fid(*pos_fid)

            entry['href'] = href
            entry['idtag'] = idtag

        # Build the TOC object

@ -350,6 +356,8 @@ class Mobi8Reader(object):
            with open(href.replace('/', os.sep), 'wb') as f:
                f.write(font['font_data'] if font['font_data'] else
                        font['raw_data'])
            if font['encrypted']:
                self.encrypted_fonts.append(href)
        else:
            imgtype = imghdr.what(None, data)
            if imgtype is None:

@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
import os

from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.utils import to_base
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index

@ -23,7 +22,30 @@ tag_fieldname_map = {
    6: ['pos_fid',0],
    21: ['parent',0],
    22: ['child1',0],
    23: ['childn',0]
    23: ['childn',0],
    69: ['image_index',0],
    70 : ['desc_offset', 0], # 'Description offset in cncx'
    71 : ['author_offset', 0], # 'Author offset in cncx'
    72 : ['image_caption_offset', 0], # 'Image caption offset in cncx',
    73 : ['image_attr_offset', 0], # 'Image attribution offset in cncx',

}

default_entry = {
    'pos': -1,
    'len': 0,
    'noffs': -1,
    'text' : "Unknown Text",
    'hlvl' : -1,
    'kind' : "Unknown Class",
    'pos_fid' : None,
    'parent' : -1,
    'child1' : -1,
    'childn' : -1,
    'description': None,
    'author': None,
    'image_caption': None,
    'image_attribution': None,
}

def read_ncx(sections, index, codec):

@ -34,32 +56,25 @@ def read_ncx(sections, index, codec):

    for num, x in enumerate(table.iteritems()):
        text, tag_map = x
        entry = {
            'name': text,
            'pos': -1,
            'len': 0,
            'noffs': -1,
            'text' : "Unknown Text",
            'hlvl' : -1,
            'kind' : "Unknown Kind",
            'pos_fid' : None,
            'parent' : -1,
            'child1' : -1,
            'childn' : -1,
            'num' : num
        }
        entry = default_entry.copy()
        entry['name'] = text
        entry['num'] = num

        for tag in tag_fieldname_map.keys():
        for tag in tag_fieldname_map.iterkeys():
            fieldname, i = tag_fieldname_map[tag]
            if tag in tag_map:
                fieldvalue = tag_map[tag][i]
                if tag == 6:
                    fieldvalue = to_base(fieldvalue, base=32)
                    # Appears to be an idx into the KF8 elems table with an
                    # offset
                    fieldvalue = tuple(tag_map[tag])
                entry[fieldname] = fieldvalue
                if tag == 3:
                    entry['text'] = cncx.get(fieldvalue, 'Unknown Text')
                if tag == 5:
                    entry['kind'] = cncx.get(fieldvalue, 'Unknown Kind')
                for which, name in {3:'text', 5:'kind', 70:'description',
                        71:'author', 72:'image_caption',
                        73:'image_attribution'}.iteritems():
                    if tag == which:
                        entry[name] = cncx.get(fieldvalue,
                                default_entry[name])
        index_entries.append(entry)

    return index_entries

@ -15,7 +15,13 @@ from calibre.ebooks import normalize

IMAGE_MAX_SIZE = 10 * 1024 * 1024

def decode_hex_number(raw):
def decode_string(raw, codec='utf-8'):
    length, = struct.unpack(b'>B', raw[0])
    raw = raw[1:1+length]
    consumed = length+1
    return raw.decode(codec), consumed

def decode_hex_number(raw, codec='utf-8'):
    '''
    Return a variable length number encoded using hexadecimal encoding. These
    numbers have the first byte which tells the number of bytes that follow.

@ -25,13 +31,16 @@ def decode_hex_number(raw):
    :param raw: Raw binary data as a bytestring

    :return: The number and the number of bytes from raw that the number
    occupies
    occupies.
    '''
    length, = struct.unpack(b'>B', raw[0])
    raw = raw[1:1+length]
    consumed = length+1
    raw, consumed = decode_string(raw, codec=codec)
    return int(raw, 16), consumed

def encode_string(raw):
    ans = bytearray(bytes(raw))
    ans.insert(0, len(ans))
    return bytes(ans)

def encode_number_as_hex(num):
    '''
    Encode num as a variable length encoded hexadecimal number. Returns the

@ -44,9 +53,7 @@ def encode_number_as_hex(num):
    nlen = len(num)
    if nlen % 2 != 0:
        num = b'0'+num
    ans = bytearray(num)
    ans.insert(0, len(num))
    return bytes(ans)
    return encode_string(num)

def encint(value, forward=True):
    '''

@ -430,7 +437,7 @@ def read_font_record(data, extent=1040): # {{{
    # The zlib compressed data begins with 2 bytes of header and
    # has 4 bytes of checksum at the end
    ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed',
            'headers':None}
            'headers':None, 'encrypted':False}

    try:
        usize, flags, dstart, xor_len, xor_start = struct.unpack_from(

@ -453,6 +460,7 @@
                buf[n] ^= key[n%xor_len] # XOR of buf and key

        font_data = bytes(buf)
        ans['encrypted'] = True

    if flags & 0b1:
        # ZLIB compressed data
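
That buf[n] ^= key[n%xor_len] line is the entire de-obfuscation step. A standalone sketch of the same idea, assuming the usual scheme in which only the first extent bytes of the font are XORed against a repeating key:

    def deobfuscate_font(data, key, extent=1040):
        # XOR the obfuscated prefix with the repeating key; the rest of
        # the font passes through untouched.
        buf = bytearray(data)
        key = bytearray(key)
        for n in xrange(min(extent, len(buf))):
            buf[n] ^= key[n % len(key)]
        return bytes(buf)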

@ -234,13 +234,15 @@ class RTFMLizer(object):
        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument
        if tag == 'img':
            src = os.path.basename(elem.get('src'))
            block_start = ''
            block_end = ''
            if 'block' not in tag_stack:
                block_start = '{\\par\\pard\\hyphpar '
                block_end = '}'
            text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end)
            src = elem.get('src')
            if src:
                src = os.path.basename(elem.get('src'))
                block_start = ''
                block_end = ''
                if 'block' not in tag_stack:
                    block_start = '{\\par\\pard\\hyphpar '
                    block_end = '}'
                text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end)

        single_tag = SINGLE_TAGS.get(tag, None)
        if single_tag:

@ -70,6 +70,9 @@ class AddAction(InterfaceAction):
        self.add_menu.addSeparator()
        ma('add-formats', _('Add files to selected book records'),
                triggered=self.add_formats, shortcut=_('Shift+A'))
        self.add_menu.addSeparator()
        ma('add-config', _('Configure the adding of books'),
                triggered=self.add_config)

        self.qaction.triggered.connect(self.add_books)

@ -78,6 +81,11 @@ class AddAction(InterfaceAction):
        for action in list(self.add_menu.actions())[1:]:
            action.setEnabled(enabled)

    def add_config(self):
        self.gui.iactions['Preferences'].do_config(
                initial_plugin=('Import/Export', 'Adding'),
                close_after_initial=True)

    def add_formats(self, *args):
        if self.gui.stack.currentIndex() != 0:
            return

@ -3,7 +3,7 @@
from __future__ import (unicode_literals, division, absolute_import, print_function)

__license__ = 'GPL 3'
__copyright__ = '2011, Tomasz Długosz <tomek3d@gmail.com>'
__copyright__ = '2011-2012, Tomasz Długosz <tomek3d@gmail.com>'
__docformat__ = 'restructuredtext en'

import re

@ -47,41 +47,47 @@ class NextoStore(BasicStoreConfig, StorePlugin):
        url = 'http://www.nexto.pl/szukaj.xml?search-clause=' + urllib.quote_plus(query) + '&scid=1015'

        br = browser()
        offset=0

        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//ul[@class="productslist"]/li'):
                if counter <= 0:
                    break

        while counter:
            with closing(br.open(url + '&_offset=' + str(offset), timeout=timeout)) as f:
                doc = html.fromstring(f.read())
                for data in doc.xpath('//ul[@class="productslist"]/li'):
                    if counter <= 0:
                        break

                    id = ''.join(data.xpath('.//div[@class="cover_container"]/a[1]/@href'))
                    if not id:
                        continue

                    price = ''.join(data.xpath('.//strong[@class="nprice"]/text()'))

                    cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src'))
                    title = ''.join(data.xpath('.//a[@class="title"]/text()'))
                    title = re.sub(r' - ebook$', '', title)
                    formats = ', '.join(data.xpath('.//ul[@class="formats_available"]/li//b/text()'))
                    DrmFree = re.search(r'bez.DRM', formats)
                    formats = re.sub(r'\(.+\)', '', formats)

                    author = ''
                    with closing(br.open('http://www.nexto.pl/' + id.strip(), timeout=timeout/4)) as nf:
                        idata = html.fromstring(nf.read())
                        author = ', '.join(idata.xpath('//div[@class="basic_data"]/p[1]/b/a/text()'))

                    counter -= 1

                    s = SearchResult()
                    s.cover_url = cover_url
                    s.title = title.strip()
                    s.author = author.strip()
                    s.price = price
                    s.detail_item = id.strip()
                    s.drm = SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED
                    s.formats = formats.upper().strip()

                    yield s
            if not doc.xpath('//div[@class="listnavigator"]//a[@class="next"]'):
                break

                id = ''.join(data.xpath('.//div[@class="cover_container"]/a[1]/@href'))
                if not id:
                    continue

                price = ''.join(data.xpath('.//strong[@class="nprice"]/text()'))

                cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src'))
                title = ''.join(data.xpath('.//a[@class="title"]/text()'))
                title = re.sub(r' - ebook$', '', title)
                formats = ', '.join(data.xpath('.//ul[@class="formats_available"]/li//b/text()'))
                DrmFree = re.search(r'bez.DRM', formats)
                formats = re.sub(r'\(.+\)', '', formats)

                author = ''
                with closing(br.open('http://www.nexto.pl/' + id.strip(), timeout=timeout/4)) as nf:
                    idata = html.fromstring(nf.read())
                    author = ', '.join(idata.xpath('//div[@class="basic_data"]/p[1]/b/a/text()'))

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = id.strip()
                s.drm = SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED
                s.formats = formats.upper().strip()

                yield s
            offset+=10
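
The new loop pages through search results by bumping a 10-per-page _offset query parameter, stopping when the result budget runs out or the page has no "next" link. Schematically (URL and page size taken from the code above):

    base = 'http://www.nexto.pl/szukaj.xml?search-clause=foo&scid=1015'
    for offset in (0, 10, 20):
        page_url = base + '&_offset=' + str(offset)
        # fetch and parse page_url; stop early once max_results items
        # have been yielded or the listnavigator "next" link disappears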

@ -255,7 +255,10 @@
      </widget>
     </item>
     <item row="3" column="1">
      <widget class="QSpinBox" name="max_view_width">
      <widget class="QSpinBox" name="max_fs_width">
       <property name="toolTip">
        <string>Set the maximum width that the book's text and pictures will take when in fullscreen mode. This allows you to read the book text without it becoming too wide.</string>
       </property>
       <property name="suffix">
        <string> px</string>
       </property>

@ -270,10 +273,10 @@
     <item row="3" column="0">
      <widget class="QLabel" name="label_7">
       <property name="text">
        <string>Maximum &view width:</string>
        <string>Maximum text width in &fullscreen:</string>
       </property>
       <property name="buddy">
        <cstring>max_view_width</cstring>
        <cstring>max_fs_width</cstring>
       </property>
      </widget>
     </item>

@ -350,7 +353,7 @@
  <tabstop>serif_family</tabstop>
  <tabstop>sans_family</tabstop>
  <tabstop>mono_family</tabstop>
  <tabstop>max_view_width</tabstop>
  <tabstop>max_fs_width</tabstop>
  <tabstop>opt_remember_window_size</tabstop>
  <tabstop>buttonBox</tabstop>
 </tabstops>

@ -12,7 +12,7 @@ from PyQt4.Qt import (QSize, QSizePolicy, QUrl, SIGNAL, Qt, QTimer,
        QPainter, QPalette, QBrush, QFontDatabase, QDialog,
        QColor, QPoint, QImage, QRegion, QVariant, QIcon,
        QFont, pyqtSignature, QAction, QByteArray, QMenu,
        pyqtSignal, QSwipeGesture)
        pyqtSignal, QSwipeGesture, QApplication)
from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings

from calibre.utils.config import Config, StringConfig

@ -46,8 +46,10 @@ def config(defaults=None):
            help=_('Remember last used window size'))
    c.add_opt('user_css', default='',
            help=_('Set the user CSS stylesheet. This can be used to customize the look of all books.'))
    c.add_opt('max_view_width', default=6000,
            help=_('Maximum width of the viewer window, in pixels.'))
    c.add_opt('max_fs_width', default=800,
            help=_("Set the maximum width that the book's text and pictures will take"
                " when in fullscreen mode. This allows you to read the book text"
                " without it becoming too wide."))
    c.add_opt('fit_images', default=True,
            help=_('Resize images larger than the viewer window to fit inside it'))
    c.add_opt('hyphenate', default=False, help=_('Hyphenate text'))

@ -101,7 +103,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
        self.standard_font.setCurrentIndex({'serif':0, 'sans':1, 'mono':2}[opts.standard_font])
        self.css.setPlainText(opts.user_css)
        self.css.setToolTip(_('Set the user CSS stylesheet. This can be used to customize the look of all books.'))
        self.max_view_width.setValue(opts.max_view_width)
        self.max_fs_width.setValue(opts.max_fs_width)
        with zipfile.ZipFile(P('viewer/hyphenate/patterns.zip',
            allow_user_override=False), 'r') as zf:
            pats = [x.split('.')[0].replace('-', '_') for x in zf.namelist()]

@ -144,7 +146,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
        c.set('user_css', unicode(self.css.toPlainText()))
        c.set('remember_window_size', self.opt_remember_window_size.isChecked())
        c.set('fit_images', self.opt_fit_images.isChecked())
        c.set('max_view_width', int(self.max_view_width.value()))
        c.set('max_fs_width', int(self.max_fs_width.value()))
        c.set('hyphenate', self.hyphenate.isChecked())
        c.set('remember_current_page', self.opt_remember_current_page.isChecked())
        c.set('wheel_flips_pages', self.opt_wheel_flips_pages.isChecked())

@ -192,6 +194,8 @@ class Document(QWebPage): # {{{
        self.loaded_javascript = False
        self.js_loader = JavaScriptLoader(
                dynamic_coffeescript=self.debug_javascript)
        self.initial_left_margin = self.initial_right_margin = u''
        self.in_fullscreen_mode = False

        self.setLinkDelegationPolicy(self.DelegateAllLinks)
        self.scroll_marks = []

@ -239,6 +243,9 @@ class Document(QWebPage): # {{{
        self.enable_page_flip = self.page_flip_duration > 0.1
        self.font_magnification_step = opts.font_magnification_step
        self.wheel_flips_pages = opts.wheel_flips_pages
        screen_width = QApplication.desktop().screenGeometry().width()
        # Leave some space for the scrollbar and some border
        self.max_fs_width = min(opts.max_fs_width, screen_width-50)

    def fit_images(self):
        if self.do_fit_images:

@ -274,6 +281,30 @@ class Document(QWebPage): # {{{
        self.set_bottom_padding(0)
        self.fit_images()
        self.init_hyphenate()
        self.initial_left_margin = unicode(self.javascript(
            'document.body.style.marginLeft').toString())
        self.initial_right_margin = unicode(self.javascript(
            'document.body.style.marginRight').toString())
        if self.in_fullscreen_mode:
            self.switch_to_fullscreen_mode()

    def switch_to_fullscreen_mode(self):
        self.in_fullscreen_mode = True
        self.javascript('''
            var s = document.body.style;
            s.maxWidth = "%dpx";
            s.marginLeft = "auto";
            s.marginRight = "auto";
            '''%self.max_fs_width)

    def switch_to_window_mode(self):
        self.in_fullscreen_mode = False
        self.javascript('''
            var s = document.body.style;
            s.maxWidth = "none";
            s.marginLeft = "%s";
            s.marginRight = "%s";
            '''%(self.initial_left_margin, self.initial_right_margin))

    @pyqtSignature("QString")
    def debug(self, msg):

@ -581,8 +612,8 @@ class DocumentView(QWebView): # {{{

    def config(self, parent=None):
        self.document.do_config(parent)
        if self.manager is not None:
            self.manager.set_max_width()
        if self.document.in_fullscreen_mode:
            self.document.switch_to_fullscreen_mode()
        self.setFocus(Qt.OtherFocusReason)

    def bookmark(self):

@ -602,6 +633,9 @@ class DocumentView(QWebView): # {{{
        menu.insertAction(list(menu.actions())[0], self.search_action)
        menu.addSeparator()
        menu.addAction(self.goto_location_action)
        if self.document.in_fullscreen_mode and self.manager is not None:
            menu.addSeparator()
            menu.addAction(self.manager.toggle_toolbar_action)
        menu.exec_(ev.globalPos())

    def lookup(self, *args):

@ -5,11 +5,11 @@ import traceback, os, sys, functools, collections, re
from functools import partial
from threading import Thread

from PyQt4.Qt import QApplication, Qt, QIcon, QTimer, SIGNAL, QByteArray, \
    QDoubleSpinBox, QLabel, QTextBrowser, \
    QPainter, QBrush, QColor, QStandardItemModel, QPalette, \
    QStandardItem, QUrl, QRegExpValidator, QRegExp, QLineEdit, \
    QToolButton, QMenu, QInputDialog, QAction, QKeySequence
from PyQt4.Qt import (QApplication, Qt, QIcon, QTimer, SIGNAL, QByteArray,
    QSize, QDoubleSpinBox, QLabel, QTextBrowser, QPropertyAnimation,
    QPainter, QBrush, QColor, QStandardItemModel, QPalette, QStandardItem,
    QUrl, QRegExpValidator, QRegExp, QLineEdit, QToolButton, QMenu,
    QInputDialog, QAction, QKeySequence)

from calibre.gui2.viewer.main_ui import Ui_EbookViewer
from calibre.gui2.viewer.printing import Printing

@ -55,8 +55,6 @@ class TOC(QStandardItemModel):
            self.appendRow(TOCItem(t))
        self.setHorizontalHeaderItem(0, QStandardItem(_('Table of Contents')))


class Worker(Thread):

    def run(self):

@ -292,6 +290,37 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
        self.tool_bar2.setContextMenuPolicy(Qt.PreventContextMenu)
        self.tool_bar.widgetForAction(self.action_bookmark).setPopupMode(QToolButton.MenuButtonPopup)
        self.action_full_screen.setCheckable(True)
        self.full_screen_label = QLabel('''
                <center>
                <h1>%s</h1>
                <h3>%s</h3>
                <h3>%s</h3>
                </center>
                '''%(_('Full screen mode'),
                    _('Right click to show controls'),
                    _('Press Esc to quit')),
                self)
        self.full_screen_label.setVisible(False)
        self.full_screen_label.setStyleSheet('''
        QLabel {
            text-align: center;
            background-color: white;
            color: black;
            border-width: 1px;
            border-style: solid;
            border-radius: 20px;
        }
        ''')
        self.toggle_toolbar_action = QAction(_('Show/hide controls'), self)
        self.toggle_toolbar_action.triggered.connect(self.toggle_toolbars)
        self.addAction(self.toggle_toolbar_action)
        self.full_screen_label_anim = QPropertyAnimation(
                self.full_screen_label, 'size')
        self.esc_full_screen_action = a = QAction(self)
        self.addAction(a)
        a.setShortcut(Qt.Key_Escape)
        a.setEnabled(False)
        a.triggered.connect(self.action_full_screen.trigger)

        self.print_menu = QMenu()
        self.print_menu.addAction(QIcon(I('print-preview.png')), _('Print Preview'))

@ -299,7 +328,6 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
        self.tool_bar.widgetForAction(self.action_print).setPopupMode(QToolButton.MenuButtonPopup)
        self.connect(self.action_print, SIGNAL("triggered(bool)"), partial(self.print_book, preview=False))
        self.connect(self.print_menu.actions()[0], SIGNAL("triggered(bool)"), partial(self.print_book, preview=True))
        self.set_max_width()
        ca = self.view.copy_action
        ca.setShortcut(QKeySequence.Copy)
        self.addAction(ca)

@ -313,6 +341,13 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
        w = self.tool_bar.widgetForAction(self.action_open_ebook)
        w.setPopupMode(QToolButton.MenuButtonPopup)

        for x in ('tool_bar', 'tool_bar2'):
            x = getattr(self, x)
            for action in x.actions():
                # So that the keyboard shortcuts for these actions will
                # continue to function even when the toolbars are hidden
                self.addAction(action)

        self.restore_state()

    def set_toc_visible(self, yes):

@ -338,9 +373,18 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
            count += 1

    def closeEvent(self, e):
        if self.isFullScreen():
            self.action_full_screen.trigger()
            e.ignore()
            return
        self.save_state()
        return MainWindow.closeEvent(self, e)

    def toggle_toolbars(self):
        for x in ('tool_bar', 'tool_bar2'):
            x = getattr(self, x)
            x.setVisible(not x.isVisible())

    def save_state(self):
        state = bytearray(self.saveState(self.STATE_VERSION))
        vprefs['viewer_toolbar_state'] = state

@ -382,11 +426,6 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
        self._lookup = None
        self.dictionary_view.setHtml(html)

    def set_max_width(self):
        from calibre.gui2.viewer.documentview import config
        c = config().parse()
        self.frame.setMaximumWidth(c.max_view_width)

    def get_remember_current_page_opt(self):
        from calibre.gui2.viewer.documentview import config
        c = config().parse()

@ -401,6 +440,46 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
        else:
            self.showFullScreen()

    def showFullScreen(self):
        self.tool_bar.setVisible(False)
        self.tool_bar2.setVisible(False)
        self._original_frame_margins = (
            self.centralwidget.layout().contentsMargins(),
            self.frame.layout().contentsMargins())
        self.frame.layout().setContentsMargins(0, 0, 0, 0)
        self.centralwidget.layout().setContentsMargins(0, 0, 0, 0)

        super(EbookViewer, self).showFullScreen()
        QTimer.singleShot(10, self.show_full_screen_label)

    def show_full_screen_label(self):
        f = self.full_screen_label
        self.esc_full_screen_action.setEnabled(True)
        f.setVisible(True)
        height = 200
        width = int(0.7*self.view.width())
        f.resize(width, height)
        f.move((self.view.width() - width)//2, (self.view.height()-height)//2)
        a = self.full_screen_label_anim
        a.setDuration(500)
        a.setStartValue(QSize(width, 0))
        a.setEndValue(QSize(width, height))
        a.start()
        QTimer.singleShot(2750, self.full_screen_label.hide)
        self.view.document.switch_to_fullscreen_mode()

    def showNormal(self):
        self.esc_full_screen_action.setEnabled(False)
        self.tool_bar.setVisible(True)
        self.tool_bar2.setVisible(True)
        self.full_screen_label.setVisible(False)
        if hasattr(self, '_original_frame_margins'):
            om = self._original_frame_margins
            self.centralwidget.layout().setContentsMargins(om[0])
            self.frame.layout().setContentsMargins(om[1])
        super(EbookViewer, self).showNormal()
        self.view.document.switch_to_window_mode()

    def goto(self, ref):
        if ref:
            tokens = ref.split('.')

@ -284,6 +284,9 @@
   <property name="text">
    <string>Toggle full screen</string>
   </property>
   <property name="toolTip">
    <string>Toggle full screen (F11)</string>
   </property>
  </action>
  <action name="action_print">
   <property name="icon">

@ -15,6 +15,7 @@ from PyQt4.Qt import (QIcon, QFont, QLabel, QListWidget, QAction,
        QMenu, QStringListModel, QCompleter, QStringList,
        QTimer, QRect, QFontDatabase, QGraphicsView)

from calibre.constants import iswindows
from calibre.gui2 import (NONE, error_dialog, pixmap_to_data, gprefs,
        warning_dialog)
from calibre.gui2.filename_pattern_ui import Ui_Form

@ -365,7 +366,7 @@ class FontFamilyModel(QAbstractListModel): # {{{
        self.families = list(qt_families.intersection(set(self.families)))
        self.families.sort()
        self.families[:0] = [_('None')]
        self.font = QFont('sansserif')
        self.font = QFont('Verdana' if iswindows else 'sansserif')

    def rowCount(self, *args):
        return len(self.families)

@ -591,6 +591,21 @@ def educateQuotes(str):
    str = re.sub(r'''""''', """””""", str)
    str = re.sub(r"""''""", """’’""", str)

    # Special case for Quotes at inside of other entities, e.g.:
    #    <p>A double quote--"within dashes"--would be nice.</p>
    str = re.sub(r"""(?<=\W)"(?=\w)""", r"""“""", str)
    str = re.sub(r"""(?<=\W)'(?=\w)""", r"""‘""", str)
    str = re.sub(r"""(?<=\w)"(?=\W)""", r"""”""", str)
    str = re.sub(r"""(?<=\w)'(?=\W)""", r"""’""", str)

    # Special case for Quotes at end of line with a preceding space (may change just to end of line)
    str = re.sub(r"""(?<=\s)"$""", r"""”""", str)
    str = re.sub(r"""(?<=\s)'$""", r"""’""", str)

    # Special case for Quotes at beginning of line with a space - multiparagraph quoted text:
    str = re.sub(r"""^"(?=\s)""", r"""“""", str)
    str = re.sub(r"""^'(?=\s)""", r"""‘""", str)

    # Special case for decade abbreviations (the '80s):
    str = re.sub(r"""\b'(?=\d{2}s)""", r"""’""", str)
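
A quick check of the new inside-entities rules, using the sentence from the comment above:

    import re
    s = u'A double quote--"within dashes"--would be nice.'
    s = re.sub(r'(?<=\W)"(?=\w)', u'\u201c', s) # opening curly quote
    s = re.sub(r'(?<=\w)"(?=\W)', u'\u201d', s) # closing curly quote
    assert s == u'A double quote--\u201cwithin dashes\u201d--would be nice.'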