MOBI Input: Use the visual formatting of the Table of Contents to try to automatically create a multi-level TOC when converting/viewing MOBI files. Fixes #763681 (Private bug)

This commit is contained in:
Kovid Goyal 2011-04-21 13:40:56 -06:00
parent 549b89c82f
commit bacd84c21d
3 changed files with 112 additions and 42 deletions

View File

@ -7,7 +7,7 @@ Code for the conversion of ebook formats and the reading of metadata
from various formats. from various formats.
''' '''
import traceback, os import traceback, os, re
from calibre import CurrentDir from calibre import CurrentDir
class ConversionError(Exception): class ConversionError(Exception):
@ -169,3 +169,42 @@ def calibre_cover(title, author_string, series_string=None,
lines.append(TextLine(series_string, author_size)) lines.append(TextLine(series_string, author_size))
return create_cover_page(lines, I('library.png'), output_format='jpg') return create_cover_page(lines, I('library.png'), output_format='jpg')
UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc)$')

# Points per unit for the absolute CSS length units (1in == 72pt == 25.4mm).
_ABSOLUTE_UNIT_PTS = {
    'pt': 1.0,
    'in': 72.0,
    'pc': 12.0,
    'mm': 72.0 / 25.4,
    'cm': 72.0 / 2.54,
}


def unit_convert(value, base, font, dpi):
    '''
    Return value in pts.

    :param value: A number (returned unchanged), a unit-less numeric string
                  (interpreted as pixels) or a string made of a number
                  followed by one of the units accepted by UNIT_RE.
    :param base: The length, in pts, that percentages are relative to.
    :param font: The font size, in pts, that em/ex/en are relative to.
    :param dpi: Dots per inch, used to convert pixels to pts.
    :return: The converted length as a float. If value is a string that
             UNIT_RE does not recognise, it is returned unchanged.
    :raises ValueError: If the numeric part matched by UNIT_RE is not a valid
                        float (for example a lone ``-``).
    '''
    # Imported locally so that this function does not depend on the module's
    # import block; works on both Python 2 (int, long, float) and Python 3.
    import numbers
    if isinstance(value, numbers.Number):
        return value
    try:
        # A bare number without a unit is treated as a pixel count
        return float(value) * 72.0 / dpi
    except (TypeError, ValueError):
        pass
    m = UNIT_RE.match(value)
    if m is None or not m.group(1):
        return value
    number, unit = float(m.group(1)), m.group(2)
    if unit in _ABSOLUTE_UNIT_PTS:
        return number * _ABSOLUTE_UNIT_PTS[unit]
    if unit == '%':
        return (number / 100.0) * base
    if unit == 'px':
        return number * 72.0 / dpi
    if unit == 'em':
        return number * font
    # Only 'ex' and 'en' remain. This is a hack for ex since we have no way
    # to know the x-height of the font
    return number * font * 0.5

View File

@ -20,7 +20,7 @@ from calibre.utils.filenames import ascii_filename
from calibre.utils.date import parse_date from calibre.utils.date import parse_date
from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.cleantext import clean_ascii_chars
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks import DRMError from calibre.ebooks import DRMError, unit_convert
from calibre.ebooks.chardet import ENCODING_PATS from calibre.ebooks.chardet import ENCODING_PATS
from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.huffcdic import HuffReader from calibre.ebooks.mobi.huffcdic import HuffReader
@ -258,6 +258,8 @@ class MobiReader(object):
} }
''') ''')
self.tag_css_rules = {} self.tag_css_rules = {}
self.left_margins = {}
self.text_indents = {}
if hasattr(filename_or_stream, 'read'): if hasattr(filename_or_stream, 'read'):
stream = filename_or_stream stream = filename_or_stream
@ -567,9 +569,21 @@ class MobiReader(object):
elif tag.tag == 'img': elif tag.tag == 'img':
tag.set('width', width) tag.set('width', width)
else: else:
styles.append('text-indent: %s' % self.ensure_unit(width)) ewidth = self.ensure_unit(width)
styles.append('text-indent: %s' % ewidth)
try:
ewidth_val = unit_convert(ewidth, 12, 500, 166)
self.text_indents[tag] = ewidth_val
except:
pass
if width.startswith('-'): if width.startswith('-'):
styles.append('margin-left: %s' % self.ensure_unit(width[1:])) styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
try:
ewidth_val = unit_convert(ewidth[1:], 12, 500, 166)
self.left_margins[tag] = ewidth_val
except:
pass
if attrib.has_key('align'): if attrib.has_key('align'):
align = attrib.pop('align').strip() align = attrib.pop('align').strip()
if align: if align:
@ -661,6 +675,26 @@ class MobiReader(object):
if hasattr(parent, 'remove'): if hasattr(parent, 'remove'):
parent.remove(tag) parent.remove(tag)
def get_left_whitespace(self, tag):
    '''
    Return the accumulated left whitespace of tag: the sum of left margin
    and text indent for tag itself and for every ancestor reached through
    getparent().

    Values recorded in self.left_margins / self.text_indents take
    precedence; otherwise <p> gets a default indent of 1.5em and
    <blockquote> a default margin of 2em, all other elements get 0.
    '''
    # NOTE(review): unit_convert is called with base=12 and font=500, so the
    # em defaults scale by 500 -- confirm this is intended. The same
    # arguments are used where the recorded values are computed.
    total = 0.0
    node = tag
    while node is not None:
        name = node.tag
        default_indent = (unit_convert('1.5em', 12, 500, 166)
                if name == 'p' else 0.0)
        default_margin = (unit_convert('2em', 12, 500, 166)
                if name == 'blockquote' else 0.0)
        margin = self.left_margins.get(node, default_margin)
        indent = self.text_indents.get(node, default_indent)
        total += margin + indent
        node = node.getparent()
    return total
def create_opf(self, htmlfile, guide=None, root=None): def create_opf(self, htmlfile, guide=None, root=None):
mi = getattr(self.book_header.exth, 'mi', self.embedded_mi) mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
if mi is None: if mi is None:
@ -731,16 +765,45 @@ class MobiReader(object):
except: except:
text = '' text = ''
text = ent_pat.sub(entity_to_unicode, text) text = ent_pat.sub(entity_to_unicode, text)
tocobj.add_item(toc.partition('#')[0], href[1:], item = tocobj.add_item(toc.partition('#')[0], href[1:],
text) text)
item.left_space = int(self.get_left_whitespace(x))
found = True found = True
if reached and found and x.get('class', None) == 'mbp_pagebreak': if reached and found and x.get('class', None) == 'mbp_pagebreak':
break break
if tocobj is not None: if tocobj is not None:
tocobj = self.structure_toc(tocobj)
opf.set_toc(tocobj) opf.set_toc(tocobj)
return opf, ncx_manifest_entry return opf, ncx_manifest_entry
def structure_toc(self, toc):
    '''
    Convert the flat list of entries in toc into a nested TOC, using the
    left_space recorded on every entry as its nesting depth.

    Each distinct left_space value becomes one level, ordered from the
    smallest (top level) to the largest. An entry is attached to the most
    recent entry of a shallower level, or to the root if there is none.

    :param toc: Iterable of entries with left_space, href, fragment and
                text attributes.
    :return: A new TOC (the TOC name is resolved from module scope), or
             toc itself, unchanged, when there are fewer than 2 or more
             than 6 distinct indentation values.
    '''
    indent_vals = sorted(set(item.left_space for item in toc))
    if len(indent_vals) > 6 or len(indent_vals) < 2:
        # Too many or too few levels, give up
        return toc
    level_of = dict((val, i) for i, val in enumerate(indent_vals))

    last_found = [None] * len(indent_vals)
    newtoc = TOC()

    def find_parent(level):
        # Nearest shallower level that currently has an open entry
        for candidate in reversed(last_found[:level]):
            if candidate is not None:
                return candidate
        return newtoc

    for item in toc:
        level = level_of[item.left_space]
        parent = find_parent(level)
        last_found[level] = parent.add_item(item.href, item.fragment,
                item.text)
        # Entries deeper than this one belong to an earlier branch. Forget
        # them, otherwise a later entry that skips a level would be attached
        # to a stale parent and end up out of document order.
        for deeper in range(level + 1, len(last_found)):
            last_found[deeper] = None

    return newtoc
def sizeof_trailing_entries(self, data): def sizeof_trailing_entries(self, data):
def sizeof_trailing_entry(ptr, psize): def sizeof_trailing_entry(ptr, psize):

View File

@ -18,6 +18,7 @@ from cssutils import profile as cssprofiles
from lxml import etree from lxml import etree
from lxml.cssselect import css_to_xpath, ExpressionError, SelectorSyntaxError from lxml.cssselect import css_to_xpath, ExpressionError, SelectorSyntaxError
from calibre import force_unicode from calibre import force_unicode
from calibre.ebooks import unit_convert
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES
from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize
from calibre.ebooks.oeb.profile import PROFILES from calibre.ebooks.oeb.profile import PROFILES
@ -444,7 +445,6 @@ class Stylizer(object):
class Style(object): class Style(object):
UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc)$')
MS_PAT = re.compile(r'^\s*(mso-|panose-|text-underline|tab-interval)') MS_PAT = re.compile(r'^\s*(mso-|panose-|text-underline|tab-interval)')
def __init__(self, element, stylizer): def __init__(self, element, stylizer):
@ -507,43 +507,11 @@ class Style(object):
return result return result
def _unit_convert(self, value, base=None, font=None): def _unit_convert(self, value, base=None, font=None):
' Return value in pts' 'Return value in pts'
if isinstance(value, (int, long, float)): if base is None:
return value base = self.width
try: font = font or self.fontSize
return float(value) * 72.0 / self._profile.dpi return unit_convert(value, base, font, self._profile.dpi)
except:
pass
result = value
m = self.UNIT_RE.match(value)
if m is not None and m.group(1):
value = float(m.group(1))
unit = m.group(2)
if unit == '%':
if base is None:
base = self.width
result = (value / 100.0) * base
elif unit == 'px':
result = value * 72.0 / self._profile.dpi
elif unit == 'in':
result = value * 72.0
elif unit == 'pt':
result = value
elif unit == 'em':
font = font or self.fontSize
result = value * font
elif unit in ('ex', 'en'):
# This is a hack for ex since we have no way to know
# the x-height of the font
font = font or self.fontSize
result = value * font * 0.5
elif unit == 'pc':
result = value * 12.0
elif unit == 'mm':
result = value * 0.04
elif unit == 'cm':
result = value * 0.40
return result
def pt_to_px(self, value): def pt_to_px(self, value):
return (self._profile.dpi / 72.0) * value return (self._profile.dpi / 72.0) * value