Initial (not-working) implementation of MOBI Indexing

This commit is contained in:
Kovid Goyal 2009-05-30 12:51:05 -07:00
parent cdfdb1dab4
commit 34f3d47d52
8 changed files with 249 additions and 177 deletions

View File

@ -72,6 +72,9 @@ if __name__ == '__main__':
library_dirs=[os.environ.get('PODOFO_LIB_DIR', podofo_lib)],
include_dirs=\
[os.environ.get('PODOFO_INC_DIR', podofo_inc)]))
else:
print 'WARNING: PoDoFo not found on your system. Various PDF related',
print 'functionality will not work.'
ext_modules = optional + [

View File

@ -27,6 +27,15 @@ class MOBIOutput(OutputFormatPlugin):
OptionRecommendation(name='toc_title', recommended_value=None,
help=_('Title for any generated in-line table of contents.')
),
OptionRecommendation(name='mobi_periodical',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('When present, generate a periodical rather than a book.')
),
OptionRecommendation(name='no_mobi_index',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Disable generation of MOBI index.')
),
])
recommendations = set([
@ -49,7 +58,7 @@ class MOBIOutput(OutputFormatPlugin):
rasterizer(oeb, opts)
mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
mobimlizer(oeb, opts)
writer = MobiWriter(imagemax=imagemax,
writer = MobiWriter(opts, imagemax=imagemax,
prefer_author_sort=opts.prefer_author_sort)
writer(oeb, output_path)

View File

@ -3,7 +3,8 @@ Write content to Mobipocket books.
'''
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.cam>'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.cam> and \
Kovid Goyal <kovid@kovidgoyal.net>'
from collections import defaultdict
from itertools import count
@ -57,6 +58,25 @@ OTHER_MAX_IMAGE_SIZE = 10 * 1024 * 1024
MAX_THUMB_SIZE = 16 * 1024
MAX_THUMB_DIMEN = (180, 240)
TAGX = {
'chapter' :
'\x00\x00\x00\x01\x01\x01\x01\x00\x02\x01\x02\x00\x03\x01\x04\x00\x04\x01\x08\x00\x00\x00\x00\x01',
'subchapter' :
'\x00\x00\x00\x01\x01\x01\x01\x00\x02\x01\x02\x00\x03\x01\x04\x00\x04\x01\x08\x00\x05\x01\x10\x00\x15\x01\x10\x00\x16\x01\x20\x00\x17\x01\x40\x00\x00\x00\x00\x01',
'periodical' :
'\x00\x00\x00\x02\x01\x01\x01\x00\x02\x01\x02\x00\x03\x01\x04\x00\x04\x01\x08\x00\x05\x01\x10\x00\x15\x01\x20\x00\x16\x01\x40\x00\x17\x01\x80\x00\x00\x00\x00\x01\x45\x01\x01\x00\x46\x01\x02\x00\x47\x01\x04\x00\x00\x00\x00\x01'
}
INDXT = {
'chapter' : '\x0f',
'subchapter' : '\x1f',
'article' : '\x3f',
'chapter with subchapters': '\x6f',
'periodical' : '\xdf',
'section' : '\xff',
}
def encode(data):
    """Return *data* serialized as UTF-8."""
    encoded = data.encode('utf-8')
    return encoded
@ -202,13 +222,11 @@ class Serializer(object):
def serialize_item(self, item):
    # Serialize one spine item into the shared output buffer, recording
    # where it starts so the index/guide can reference it later.
    buffer = self.buffer
    #buffer.write('<mbp:section>')
    # Non-linear items are treated as section breaks: remember the byte
    # position just before this item begins.
    if not item.linear:
        self.breaks.append(buffer.tell() - 1)
    # Map the item's href to its byte offset in the text stream; consumed
    # later when index entry offsets are computed.
    self.id_offsets[item.href] = buffer.tell()
    # Serialize the children of the XHTML <body> element.
    # NOTE(review): assumes item.data is a parsed XHTML tree with a body
    # element -- confirm against the Serializer's callers.
    for elem in item.data.find(XHTML('body')):
        self.serialize_elem(elem, item)
    #buffer.write('</mbp:section>')
    # Force a page break between spine items in the MOBI output.
    buffer.write('<mbp:pagebreak/>')
def serialize_elem(self, elem, item, nsrmap=NSRMAP):
@ -288,11 +306,13 @@ class Serializer(object):
class MobiWriter(object):
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def __init__(self, compression=PALMDOC, imagemax=None,
def __init__(self, opts, compression=PALMDOC, imagemax=None,
prefer_author_sort=False):
self.opts = opts
self._compression = compression or UNCOMPRESSED
self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE
self._prefer_author_sort = prefer_author_sort
self._primary_index_record = None
@classmethod
def generate(cls, opts):
@ -327,6 +347,8 @@ class MobiWriter(object):
def _generate_content(self):
self._map_image_names()
self._generate_text()
if not self.opts.no_mobi_index:
self._generate_index()
self._generate_images()
def _map_image_names(self):
@ -372,6 +394,8 @@ class MobiWriter(object):
serializer = Serializer(self._oeb, self._images)
breaks = serializer.breaks
text = serializer.text
self._id_offsets = serializer.id_offsets
self._content_length = len(text)
self._text_length = len(text)
text = StringIO(text)
nrecords = 0
@ -408,10 +432,205 @@ class MobiWriter(object):
data, overlap = self._read_text_record(text)
self._text_nrecords = nrecords
def _generate_indxt(self, ctoc):
    # Build the INDXT payload (one entry per TOC node) plus the matching
    # IDXT offset table for a flat, chapter-only MOBI index.
    # Returns (indxt_bytes, entry_count, idxt_bytes, last_index, last_name).
    # NOTE(review): the `ctoc` argument is not used here; CTOC offsets come
    # from self._ctoc_map, which _generate_ctoc() populates -- confirm the
    # parameter is intentional.
    if self.opts.mobi_periodical:
        raise NotImplementedError('Indexing for periodicals not implemented')
    toc = self._oeb.toc
    indxt, indices, c = StringIO(), StringIO(), 0
    indices.write('INDX')
    c = 0  # NOTE(review): redundant -- c was already initialised above
    last_index = last_name = None

    def add_node(node, offset, length, count):
        # Emit one index entry for `node`, covering `length` bytes of text
        # starting at byte `offset`.
        t = node.title  # NOTE(review): unused local
        if self.opts.verbose > 2:
            self._oeb.log.debug('Adding TOC node:', node.title, 'href:',
                    node.href)
        # Entry position relative to record start; the record carries a
        # 0xc0-byte header before the INDXT data.
        pos = 0xc0 + indxt.tell()
        indices.write(pack('>H', pos))
        # Entry name: single length byte followed by the decimal string of
        # the entry number.
        indxt.write(chr(len(str(count)))+str(count))
        indxt.write(INDXT['chapter'])
        indxt.write(decint(offset, DECINT_FORWARD))
        indxt.write(decint(length, DECINT_FORWARD))
        # Offset of this node's title inside the CTOC record.
        indxt.write(decint(self._ctoc_map[node], DECINT_FORWARD))
        indxt.write(decint(0, DECINT_FORWARD))

    # Skip the TOC root; walk the remaining entries in document order.
    entries = list(toc.iter())[1:]
    for i, child in enumerate(entries):
        if not child.title or not child.title.strip():
            continue
        h = child.href
        if h not in self._id_offsets:
            self._oeb.log.warning('Could not find TOC entry:', child.title)
            continue
        offset = self._id_offsets[h]
        # Length of this entry: distance to the first later entry that
        # starts after it, else to the end of the text.
        length = None
        for sibling in entries[i+1:]:
            h2 = sibling.href
            if h2 in self._id_offsets:
                offset2 = self._id_offsets[h2]
                if offset2 > offset:
                    length = offset2 - offset
                    break
        if length is None:
            length = self._content_length - offset
        add_node(child, offset, length, c)
        last_index = c
        ctoc_offset = self._ctoc_map[child]  # NOTE(review): unused local
        last_name = self._ctoc_name_map[child]
        c += 1
    return indxt.getvalue(), c, indices.getvalue(), last_index, last_name
def _generate_index(self):
    # Generate the MOBI index: builds the CTOC, INDX1 and INDX0 records
    # and appends them to self._records. Remembers the position of the
    # first index record in self._primary_index_record so record0 can
    # point at it.
    self._oeb.log('Generating index...')
    self._primary_index_record = None
    ctoc = self._generate_ctoc()
    indxt, indxt_count, indices, last_index, last_name = \
            self._generate_indxt(ctoc)

    # ---- INDX1: the entry data record -------------------------------
    indx1 = StringIO()
    indx1.write('INDX'+pack('>I', 0xc0)) # header length
    # 0x8 - 0xb : Unknown
    indx1.write('\0'*4)
    # 0xc - 0xf : Header type
    indx1.write(pack('>I', 1))
    # 0x10 - 0x13 : Unknown
    indx1.write('\0'*4)
    # 0x14 - 0x17 : IDXT offset
    # 0x18 - 0x1b : IDXT count
    indx1.write(pack('>I', 0xc0+len(indxt)))
    indx1.write(pack('>I', indxt_count))
    # 0x1c - 0x23 : Unknown
    indx1.write('\xff'*8)
    # 0x24 - 0xbf
    indx1.write('\0'*156)
    indx1.write(indxt)
    indx1.write(indices)
    indx1 = indx1.getvalue()

    # ---- INDX0: the header record -----------------------------------
    # Trailing IDXT0 block: name of the last entry plus its index.
    idxt0 = last_name + pack('>H', last_index)
    indx0 = StringIO()
    tagx = TAGX['periodical' if self.opts.mobi_periodical else 'chapter']
    tagx = 'TAGX' + pack('>I', 8 + len(tagx)) + tagx
    indx0_indices_pos = 0xc0 + len(tagx) + len(idxt0)
    indx0_indices = 'INDX' + pack('>H', 0xc0 + len(tagx))
    # Generate record header
    header = StringIO()
    header.write('INDX')
    header.write(pack('>I', 0xc0)) # header length
    # 0x08 - 0x0b : Unknown
    header.write('\0'*4)
    # 0x0c - 0x0f : Header type
    header.write(pack('>I', 0))
    # 0x10 - 0x13 : Generator ID
    header.write(pack('>I', 6))
    # 0x14 - 0x17 : IDXT offset
    header.write(pack('>I', indx0_indices_pos))
    # 0x18 - 0x1b : IDXT count
    header.write(pack('>I', 1))
    # 0x1c - 0x1f : Text encoding ?
    # NOTE(review): 650001 looks like a typo for codepage 65001 (UTF-8) --
    # confirm against the MOBI format spec.
    header.write(pack('>I', 650001))
    # 0x20 - 0x23 : Language code?
    header.write(iana2mobi(str(self._oeb.metadata.language[0])))
    # 0x24 - 0x27 : Number of TOC entries in INDX1
    header.write(pack('>I', indxt_count))
    # 0x28 - 0x2b : ORDT Offset
    header.write('\0'*4)
    # 0x2c - 0x2f : LIGT offset
    header.write('\0'*4)
    # 0x30 - 0x33 : Number of LIGT entries
    header.write('\0'*4)
    # 0x34 - 0x37 : Unknown
    header.write(pack('>I', 1))
    # 0x38 - 0xb3 : Unknown (pad?)
    header.write('\0'*124)
    # 0xb4 - 0xb7 : TAGX offset
    header.write(pack('>I', 0xc0))
    # 0xb8 - 0xbf : Unknown
    header.write('\0'*8)
    header = header.getvalue()

    indx0.write(header)
    indx0.write(tagx)
    indx0.write(idxt0)
    indx0.write(indx0_indices)
    indx0 = indx0.getvalue()

    # The index records are appended at the current end of the record
    # list; record0 will store this position.
    self._primary_index_record = len(self._records)

    # At high verbosity, dump the raw records for offline inspection.
    if self.opts.verbose > 3:
        from tempfile import mkdtemp
        import os
        t = mkdtemp()
        open(os.path.join(t, 'indx0.bin'), 'wb').write(indx0)
        open(os.path.join(t, 'indx1.bin'), 'wb').write(indx1)
        open(os.path.join(t, 'ctoc.bin'), 'wb').write(ctoc)
        self._oeb.log.debug('Index records dumped to', t)

    self._records.extend([indx0, indx1, ctoc])
def _generate_ctoc(self):
    # Serialize every TOC entry title into a single CTOC record and build
    # two maps keyed by TOC node: _ctoc_map (node -> byte offset of its
    # entry) and _ctoc_name_map (node -> encoded length-prefixed title).
    if self.opts.mobi_periodical:
        raise NotImplementedError('Indexing for periodicals not implemented')
    toc = self._oeb.toc
    self._ctoc_map = {}
    self._ctoc_name_map = {}
    self._last_toc_entry = None
    ctoc = StringIO()

    def add_node(node, cls):
        # NOTE(review): `cls` is unused -- presumably reserved for
        # per-class (chapter/section/article) handling; confirm.
        t = node.title
        if t and t.strip():
            t = t.strip()
            # Normalise the title to UTF-8 bytes.
            if not isinstance(t, unicode):
                t = t.decode('utf-8', 'replace')
            t = t.encode('utf-8')
            self._last_toc_entry = t
            self._ctoc_map[node] = ctoc.tell()
            # Each CTOC entry is a forward decint length prefix followed
            # by the UTF-8 title bytes.
            self._ctoc_name_map[node] = decint(len(t), DECINT_FORWARD)+t
            ctoc.write(self._ctoc_name_map[node])

    for child in toc.iter():
        add_node(child, 'chapter')
    return ctoc.getvalue()
def _generate_images(self):
self._oeb.logger.info('Serializing images...')
images = [(index, href) for href, index in self._images.items()]
images.sort()
self._first_image_record = None
for _, href in images:
item = self._oeb.manifest.hrefs[href]
try:
@ -420,6 +639,8 @@ class MobiWriter(object):
self._oeb.logger.warn('Bad image file %r' % item.href)
continue
self._records.append(data)
if self._first_image_record is None:
self._first_image_record = len(self._records)-1
def _generate_record0(self):
metadata = self._oeb.metadata
@ -446,8 +667,9 @@ class MobiWriter(object):
# 0xC - 0xF : Text encoding (65001 is utf-8)
# 0x10 - 0x13 : UID
# 0x14 - 0x17 : Generator version
btype = 0x101 if self.opts.mobi_periodical else 2
record0.write(pack('>IIIII',
0xe8, 2, 65001, uid, 6))
0xe8, btype, 65001, uid, 6))
# 0x18 - 0x1f : Unknown
record0.write('\xff' * 8)
@ -477,7 +699,7 @@ class MobiWriter(object):
# 0x58 - 0x5b : Format version
# 0x5c - 0x5f : First image record number
record0.write(pack('>II',
6, self._text_nrecords + 1))
6, self._first_image_record if self._first_image_record else 0))
# 0x60 - 0x63 : First HUFF/CDIC record number
# 0x64 - 0x67 : Number of HUFF/CDIC records
@ -537,8 +759,8 @@ class MobiWriter(object):
record0.write(pack('>I', 5))
# 0xe4 - 0xe7 : Primary index record
# TODO: Implement
record0.write(pack('>I', 0xffffffff))
record0.write(pack('>I', 0xffffffff if self._primary_index_record is
None else self._primary_index_record))
record0.write(exth)
record0.write(title)

View File

@ -30,7 +30,7 @@ STYLE_CSS = {
margin-left: 3.6em;
}
""",
'centered': """
.calibre_toc_header {
text-align: center;
@ -48,18 +48,18 @@ class HTMLTOCAdder(object):
def __init__(self, title=None, style='nested'):
self.title = title
self.style = style
@classmethod
def config(cls, cfg):
group = cfg.add_group('htmltoc', _('HTML TOC generation options.'))
group('toc_title', ['--toc-title'], default=None,
group('toc_title', ['--toc-title'], default=None,
help=_('Title for any generated in-line table of contents.'))
return cfg
@classmethod
def generate(cls, opts):
return cls(title=opts.toc_title)
def __call__(self, oeb, context):
if 'toc' in oeb.guide:
return

View File

@ -20,7 +20,6 @@ entry_points = {
'ebook-convert = calibre.ebooks.conversion.cli:main',
'markdown-calibre = calibre.ebooks.markdown.markdown:main',
'web2disk = calibre.web.fetch.simple:main',
'feeds2disk = calibre.web.feeds.main:main',
'calibre-server = calibre.library.server:main',
'lrf2lrs = calibre.ebooks.lrf.lrfparser:main',
'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main',

View File

@ -53,7 +53,7 @@ def get_opts_from_parser(parser, prefix):
for x in do_opt(o): yield x
def send(ans):
pat = re.compile('([^0-9a-zA-Z_./])')
pat = re.compile('([^0-9a-zA-Z_./-])')
for x in sorted(set(ans)):
x = pat.sub(lambda m : '\\'+m.group(1), x)
if x.endswith('\\ '):

View File

@ -1,161 +0,0 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
CLI for downloading feeds.
'''
import sys, os
from calibre.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
from calibre.web.fetch.simple import option_parser as _option_parser
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.config import Config, StringConfig
def config(defaults=None):
    '''
    Build the feeds2disk option set.

    :param defaults: When ``None``, a :class:`Config` backed by the saved
        ``feeds2disk`` configuration is returned; otherwise a
        :class:`StringConfig` parsed from the *defaults* string.
    :return: The configured :class:`Config`/:class:`StringConfig` object.
    '''
    desc = _('Options to control the fetching of periodical content from the web.')
    c = Config('feeds2disk', desc) if defaults is None else StringConfig(defaults, desc)

    # Options forwarded to the underlying web2disk download engine.
    web2disk = c.add_group('web2disk', _('Customize the download engine'))
    web2disk('timeout', ['-t', '--timeout'], default=10.0,
        help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),)
    web2disk('delay', ['--delay'], default=0,
        help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
    web2disk('encoding', ['--encoding'], default=None,
        help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
    web2disk('match_regexps', ['--match-regexp'], default=[], action='append',
        help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.'))
    web2disk('filter_regexps', ['--filter-regexp'], default=[], action='append',
        help=_('Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.'))
    web2disk('no_stylesheets', ['--dont-download-stylesheets'], action='store_true', default=False,
        help=_('Do not download CSS stylesheets.'))

    # Top-level feeds2disk options.
    c.add_opt('feeds', ['--feeds'], default=None,
        help=_('''Specify a list of feeds to download. For example:
"['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']"
If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.'''))
    c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count',
        help=_('''Be more verbose while processing.'''))
    c.add_opt('title', ['--title'], default=None,
        help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.'))
    c.add_opt('username', ['-u', '--username'], default=None,
        help=_('Username for sites that require a login to access content.'))
    c.add_opt('password', ['-p', '--password'], default=None,
        help=_('Password for sites that require a login to access content.'))
    c.add_opt('lrf', ['--lrf'], default=False, action='store_true',
        help='Optimize fetching for subsequent conversion to LRF.')
    c.add_opt('epub', ['--epub'], default=False, action='store_true',
        help='Optimize fetching for subsequent conversion to EPUB.')
    c.add_opt('mobi', ['--mobi'], default=False, action='store_true',
        help='Optimize fetching for subsequent conversion to MOBI.')
    c.add_opt('recursions', ['--recursions'], default=0,
        # Fixed typo in user-visible help: "Defaul" -> "Default".
        help=_('Number of levels of links to follow on webpages that are linked to from feeds. Default %default'))
    c.add_opt('output_dir', ['--output-dir'], default='.',
        help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.'))
    c.add_opt('no_progress_bar', ['--no-progress-bar'], default=False, action='store_true',
        help=_("Don't show the progress bar"))
    c.add_opt('debug', ['--debug'], action='store_true', default=False,
        help=_('Very verbose output, useful for debugging.'))
    c.add_opt('test', ['--test'], action='store_true', default=False,
        help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.'))

    return c
USAGE=_('''\
%%prog [options] ARG
%%prog parses an online source of articles, like an RSS or ATOM feed and
fetches the article contents organized in a nice hierarchy.
ARG can be one of:
file name - %%prog will try to load a recipe from the file
builtin recipe title - %%prog will load the builtin recipe and use it to fetch the feed. For e.g. Newsweek or "The BBC" or "The New York Times"
recipe as a string - %%prog will load the recipe directly from the string arg.
Available builtin recipes are:
%s
''')%(unicode(list(titles))[1:-1])
def option_parser(usage=USAGE):
    '''
    Build the feeds2disk command-line parser on top of the web2disk
    parser, removing options that do not apply and adding feed-specific
    ones.

    :param usage: The usage string shown in ``--help`` output.
    :return: The configured option parser.
    '''
    p = _option_parser(usage=usage)
    # These web2disk options are managed by the recipe machinery instead.
    p.remove_option('--max-recursions')
    p.remove_option('--base-dir')
    p.remove_option('--verbose')
    p.remove_option('--max-files')
    p.subsume('WEB2DISK OPTIONS', _('Options to control web2disk (used to fetch websites linked from feeds)'))

    p.add_option('--feeds', default=None,
            help=_('''Specify a list of feeds to download. For example:
"['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']"
If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.'''))
    p.add_option('--verbose', default=False, action='store_true',
            help=_('''Be more verbose while processing.'''))
    p.add_option('--title', default=None,
            help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.'))
    p.add_option('--username', default=None, help=_('Username for sites that require a login to access content.'))
    p.add_option('--password', default=None, help=_('Password for sites that require a login to access content.'))
    p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.')
    p.add_option('--recursions', default=0, type='int',
            # Fixed typo in user-visible help: "Defaul" -> "Default".
            help=_('Number of levels of links to follow on webpages that are linked to from feeds. Default %default'))
    p.add_option('--output-dir', default=os.getcwd(),
            help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.'))
    p.add_option('--no-progress-bar', dest='no_progress_bar', default=False, action='store_true',
            # Fixed typo in user-visible help: "Dont" -> "Don't".
            help=_("Don't show the progress bar"))
    p.add_option('--debug', action='store_true', default=False,
            help=_('Very verbose output, useful for debugging.'))
    p.add_option('--test', action='store_true', default=False,
            help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.'))
    return p
class RecipeError(Exception):
    """Raised when an argument cannot be resolved to a valid recipe."""
def run_recipe(opts, recipe_arg, parser, notification=None):
    '''
    Resolve *recipe_arg* to a recipe (file path, builtin recipe title, or
    recipe source string), instantiate it and download its feeds.

    :param opts: Parsed options from :func:`option_parser`.
    :param recipe_arg: File name, builtin recipe title, or recipe source.
    :param parser: The option parser (passed through to the recipe).
    :param notification: Progress callback; when ``None`` a terminal
        progress bar is created.
    :raises RecipeError: If *recipe_arg* cannot be resolved to a recipe.
    :return: The recipe instance after downloading.
    '''
    if notification is None:
        from calibre.utils.terminfo import TerminalController, ProgressBar
        term = TerminalController(sys.stdout)
        pb = ProgressBar(term, _('Fetching feeds...'), no_progress_bar=opts.no_progress_bar)
        notification = pb.update
    recipe = None
    if opts.feeds is not None:
        # Explicit feed list: use the generic builtin recipe.
        recipe = BasicNewsRecipe
    else:
        try:
            if os.access(recipe_arg, os.R_OK):
                # Close the file handle explicitly instead of relying on
                # the garbage collector (open(...).read() leaked it).
                f = open(recipe_arg)
                try:
                    recipe = compile_recipe(f.read())
                finally:
                    f.close()
            else:
                # Deliberate control-flow jump to the fallback below.
                raise Exception('not file')
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed.
        except Exception:
            recipe = get_builtin_recipe(recipe_arg)
            if recipe is None:
                recipe = compile_recipe(recipe_arg)
    if recipe is None:
        raise RecipeError(recipe_arg+ ' is an invalid recipe')

    recipe = recipe(opts, parser, notification)
    if not os.path.exists(recipe.output_dir):
        os.makedirs(recipe.output_dir)
    recipe.download(for_lrf=True)
    return recipe
def main(args=sys.argv, notification=None):
    # CLI entry point: parse arguments and run the requested recipe.
    # Returns 0 on success, 1 on usage error.
    p = option_parser()
    opts, args = p.parse_args(args=args[1:])
    # Exactly one recipe argument is required unless --feeds was given.
    if len(args) != 1 and opts.feeds is None:
        p.print_help()
        return 1
    # With --feeds there may be no positional argument at all.
    recipe_arg = args[0] if len(args) > 0 else None
    run_recipe(opts, recipe_arg, p, notification=notification)
    return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -109,7 +109,7 @@ class Newsweek(BasicNewsRecipe):
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup(self.INDEX)
soup = self.index_to_soup('http://www.newsweek.com')
link_item = soup.find('div',attrs={'class':'cover-image'})
if link_item and link_item.a and link_item.a.img:
cover_url = link_item.a.img['src']