MOBI Input: Use the visual formatting of the Table of Contents to try to automatically create a multi-level TOC when converting/viewing MOBI files. Fixes #763681 (Private bug)

This commit is contained in:
Kovid Goyal 2011-04-21 13:40:56 -06:00
parent 549b89c82f
commit bacd84c21d
3 changed files with 112 additions and 42 deletions

View File

@ -7,7 +7,7 @@ Code for the conversion of ebook formats and the reading of metadata
from various formats.
'''
import traceback, os
import traceback, os, re
from calibre import CurrentDir
class ConversionError(Exception):
@ -169,3 +169,42 @@ def calibre_cover(title, author_string, series_string=None,
lines.append(TextLine(series_string, author_size))
return create_cover_page(lines, I('library.png'), output_format='jpg')
# A (possibly negative) decimal number followed by a CSS length unit.
UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc)$')


def unit_convert(value, base, font, dpi):
    '''
    Return value in pts.

    :param value: A number, which is returned unchanged, or a string. A string
                  holding a bare number is interpreted as pixels; otherwise it
                  must be a number followed by one of the units accepted by
                  UNIT_RE.
    :param base: The length, in pts, that percentages are relative to.
    :param font: The font size, in pts, that em, ex and en are relative to.
    :param dpi: Dots per inch, used to convert pixels to pts.
    :return: The length in pts. A string that is not recognised as a length
             is returned unchanged.
    '''
    try:
        numeric_types = (int, long, float)
    except NameError:
        # Interpreters without a separate long type
        numeric_types = (int, float)
    if isinstance(value, numeric_types):
        return value

    # A unit-less number is taken to be a pixel count
    try:
        pixels = float(value)
    except (TypeError, ValueError):
        pass
    else:
        return pixels * 72.0 / dpi

    result = value
    m = UNIT_RE.match(value)
    if m is not None and m.group(1):
        number = float(m.group(1))
        unit = m.group(2)
        if unit == '%':
            result = (number / 100.0) * base
        elif unit == 'px':
            result = number * 72.0 / dpi
        elif unit == 'in':
            result = number * 72.0
        elif unit == 'pt':
            result = number
        elif unit == 'em':
            result = number * font
        elif unit in ('ex', 'en'):
            # This is a hack for ex since we have no way to know
            # the x-height of the font
            result = number * font * 0.5
        elif unit == 'pc':
            result = number * 12.0
        elif unit == 'mm':
            # 1in == 25.4mm == 72pt
            result = number * 72.0 / 25.4
        elif unit == 'cm':
            # 1in == 2.54cm == 72pt
            result = number * 72.0 / 2.54
    return result

View File

@ -20,7 +20,7 @@ from calibre.utils.filenames import ascii_filename
from calibre.utils.date import parse_date
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks import DRMError
from calibre.ebooks import DRMError, unit_convert
from calibre.ebooks.chardet import ENCODING_PATS
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.huffcdic import HuffReader
@ -258,6 +258,8 @@ class MobiReader(object):
}
''')
self.tag_css_rules = {}
self.left_margins = {}
self.text_indents = {}
if hasattr(filename_or_stream, 'read'):
stream = filename_or_stream
@ -567,9 +569,21 @@ class MobiReader(object):
elif tag.tag == 'img':
tag.set('width', width)
else:
styles.append('text-indent: %s' % self.ensure_unit(width))
ewidth = self.ensure_unit(width)
styles.append('text-indent: %s' % ewidth)
try:
ewidth_val = unit_convert(ewidth, 12, 500, 166)
self.text_indents[tag] = ewidth_val
except:
pass
if width.startswith('-'):
styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
try:
ewidth_val = unit_convert(ewidth[1:], 12, 500, 166)
self.left_margins[tag] = ewidth_val
except:
pass
if attrib.has_key('align'):
align = attrib.pop('align').strip()
if align:
@ -661,6 +675,26 @@ class MobiReader(object):
if hasattr(parent, 'remove'):
parent.remove(tag)
def get_left_whitespace(self, tag):
    '''
    Return the total left whitespace in front of tag: the left margin plus
    the text indent of tag itself and of every one of its ancestors, as
    computed by unit_convert().
    '''

    def own_whitespace(node):
        # Fallbacks used when no explicit value was recorded for this node
        default_margin = default_indent = 0.0
        if node.tag == 'p':
            default_indent = unit_convert('1.5em', 12, 500, 166)
        if node.tag == 'blockquote':
            default_margin = unit_convert('2em', 12, 500, 166)
        margin = self.left_margins.get(node, default_margin)
        indent = self.text_indents.get(node, default_indent)
        return margin + indent

    total = 0.0
    node = tag
    # Walk up the tree, accumulating the contribution of each level
    while node is not None:
        total += own_whitespace(node)
        node = node.getparent()
    return total
def create_opf(self, htmlfile, guide=None, root=None):
mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
if mi is None:
@ -731,16 +765,45 @@ class MobiReader(object):
except:
text = ''
text = ent_pat.sub(entity_to_unicode, text)
tocobj.add_item(toc.partition('#')[0], href[1:],
item = tocobj.add_item(toc.partition('#')[0], href[1:],
text)
item.left_space = int(self.get_left_whitespace(x))
found = True
if reached and found and x.get('class', None) == 'mbp_pagebreak':
break
if tocobj is not None:
tocobj = self.structure_toc(tocobj)
opf.set_toc(tocobj)
return opf, ncx_manifest_entry
def structure_toc(self, toc):
    '''
    Build a multi-level TOC from a flat one, using the left_space attribute
    of every entry as its nesting depth.

    Each distinct left_space value becomes one level, ordered from the
    smallest (outermost) to the largest (innermost). An entry is attached to
    the most recent entry of the closest shallower level, or to the root
    when there is none.

    :param toc: An iterable of entries with left_space, href, fragment and
                text attributes.
    :return: A new TOC with the nested structure, or toc itself, unchanged,
             when there are fewer than 2 or more than 6 distinct levels.
    '''
    distinct = set(item.left_space for item in toc)
    if len(distinct) > 6 or len(distinct) < 2:
        # Too many or too few levels, give up
        return toc
    indent_vals = sorted(distinct)
    level_of = dict((val, i) for i, val in enumerate(indent_vals))

    # last_found[i] is the most recently added entry at level i that can
    # still act as a parent, or None
    last_found = [None] * len(indent_vals)

    newtoc = TOC()

    def find_parent(level):
        for candidate in reversed(last_found[:level]):
            if candidate is not None:
                return candidate
        return newtoc

    for item in toc:
        level = level_of[item.left_space]
        parent = find_parent(level)
        last_found[level] = parent.add_item(item.href, item.fragment,
                item.text)
        # Entries deeper than this one belong to an earlier branch. Forget
        # them, so that a later entry is never attached to a parent that
        # precedes this entry in the document.
        for deeper in range(level + 1, len(last_found)):
            last_found[deeper] = None

    return newtoc
def sizeof_trailing_entries(self, data):
def sizeof_trailing_entry(ptr, psize):

View File

@ -18,6 +18,7 @@ from cssutils import profile as cssprofiles
from lxml import etree
from lxml.cssselect import css_to_xpath, ExpressionError, SelectorSyntaxError
from calibre import force_unicode
from calibre.ebooks import unit_convert
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES
from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize
from calibre.ebooks.oeb.profile import PROFILES
@ -444,7 +445,6 @@ class Stylizer(object):
class Style(object):
UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc)$')
MS_PAT = re.compile(r'^\s*(mso-|panose-|text-underline|tab-interval)')
def __init__(self, element, stylizer):
@ -507,43 +507,11 @@ class Style(object):
return result
def _unit_convert(self, value, base=None, font=None):
# NOTE(review): the enclosing hunk header (-507,43 +507,11) suggests this span
# is a diff: the previous inline implementation, which ends at the first
# "return result", followed by its replacement, which delegates to
# unit_convert() - confirm against the repository.
' Return value in pts'
# Numbers are returned as they are
if isinstance(value, (int, long, float)):
return value
# A string holding a bare number is converted with the same formula as px
try:
return float(value) * 72.0 / self._profile.dpi
except:
pass
# Anything that does not match UNIT_RE is returned unchanged
result = value
m = self.UNIT_RE.match(value)
if m is not None and m.group(1):
value = float(m.group(1))
unit = m.group(2)
if unit == '%':
# Percentages are relative to base, which defaults to self.width
if base is None:
base = self.width
result = (value / 100.0) * base
elif unit == 'px':
result = value * 72.0 / self._profile.dpi
elif unit == 'in':
result = value * 72.0
elif unit == 'pt':
result = value
elif unit == 'em':
# A falsy font falls back to self.fontSize
font = font or self.fontSize
result = value * font
elif unit in ('ex', 'en'):
# This is a hack for ex since we have no way to know
# the x-height of the font
font = font or self.fontSize
result = value * font * 0.5
elif unit == 'pc':
result = value * 12.0
elif unit == 'mm':
# NOTE(review): 0.04 is roughly inches per mm; pts per mm would be
# 72 / 25.4 (about 2.83) - confirm the intended unit
result = value * 0.04
elif unit == 'cm':
# NOTE(review): same concern as for mm; pts per cm would be about 28.35
result = value * 0.40
return result
'Return value in pts'
# Fill in the defaults from this style, then delegate the conversion to
# unit_convert() using the dpi of the profile
if base is None:
base = self.width
font = font or self.fontSize
return unit_convert(value, base, font, self._profile.dpi)
def pt_to_px(self, value):
    'Return value, a length in pts, in pixels at the dpi of the profile'
    # There are 72 pts to the inch
    px_per_pt = self._profile.dpi / 72.0
    return px_per_pt * value