mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-11-22 14:33:02 -05:00
200 lines
6.6 KiB
Python
200 lines
6.6 KiB
Python
#!/usr/bin/env python
|
|
# vim:fileencoding=utf-8
|
|
from __future__ import (unicode_literals, division, absolute_import,
|
|
print_function)
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
import os
|
|
|
|
from calibre.ebooks.docx.names import XPath
|
|
# Non-breaking space. cleanup_markup() uses it as placeholder text for
# page-break paragraphs that would otherwise have no text at all.
NBSP = '\xa0'
|
|
|
|
def mergeable(previous, current):
    """Report whether ``current`` may be folded into ``previous``.

    Two elements are mergeable only when all of the following hold:

    * neither of them has tail text,
    * both carry the same ``class`` attribute value (or both have none),
    * ``current`` has no non-empty ``id`` attribute,
    * ``current`` is the very next sibling of ``previous``.

    Both arguments must offer the lxml-style element API used here
    (``tail``, ``get()`` and ``itersiblings()``).
    """
    if (previous.tail or current.tail
            or previous.get('class', None) != current.get('class', None)
            or current.get('id', False)):
        return False
    # Only the first following sibling matters; no sibling at all means the
    # two elements cannot be adjacent.
    for sibling in previous.itersiblings():
        return sibling is current
    return False
|
|
|
|
|
|
def append_text(parent, text):
    """Append ``text`` after everything currently inside ``parent``.

    When ``parent`` has child elements the text is added to the tail of the
    last child, otherwise it is added to ``parent.text``. A missing (None)
    text or tail is treated as the empty string.
    """
    if len(parent) == 0:
        parent.text = (parent.text or '') + text
        return
    last = parent[-1]
    last.tail = (last.tail or '') + text
|
|
|
|
|
|
def merge(parent, span):
    """Fold the content of ``span`` into ``parent`` and remove ``span``.

    Despite its name, ``parent`` is simply the element that receives the
    content (merge_run() passes the first span of a run). In order, the
    text of ``span``, each of its child elements and finally its tail text
    are appended to ``parent``; then ``span`` is removed from the element
    that contains it.
    """
    leading = span.text
    if leading:
        append_text(parent, leading)
    for child in span:
        parent.append(child)
    trailing = span.tail
    if trailing:
        append_text(parent, trailing)
    container = span.getparent()
    container.remove(span)
|
|
|
|
|
|
def merge_run(run):
    """Collapse a run of elements into its first member.

    ``run`` is a non-empty sequence; every element after the first is merged
    into ``run[0]`` with merge(), in sequence order.
    """
    target, rest = run[0], run[1:]
    for extra in rest:
        merge(target, extra)
|
|
|
|
|
|
def liftable(css):
    """Return True if a <span> styled with ``css`` can be lifted.

    A <span> is liftable if all its styling would work just as well if it is
    specified on the parent element. That is taken to be the case when every
    property name in ``css`` starts (up to the first ``-``) with one of
    ``text``, ``font``, ``letter``, ``color`` or ``background``. An empty
    mapping is liftable.

    ``css`` is a mapping keyed by CSS property name; only its keys are read.
    """
    # Iterate the mapping directly rather than via iterkeys(), which only
    # exists on Python 2 dictionaries.
    prefixes = {name.partition('-')[0] for name in css}
    return not (prefixes - {'text', 'font', 'letter', 'color', 'background'})
|
|
|
|
|
|
def add_text(elem, attr, text):
    """Append ``text`` to the string attribute ``attr`` of ``elem``.

    ``attr`` is an attribute name such as ``'text'`` or ``'tail'``. A current
    value of None (or an empty string) is treated as the empty string.
    """
    current = getattr(elem, attr)
    if not current:
        current = ''
    setattr(elem, attr, current + text)
|
|
|
|
|
|
def lift(span):
    """Replace an element by its content (text, children and tail).

    The text of ``span`` is attached in front of the position ``span``
    occupied, its children take its place in their original order, and its
    tail ends up after the last of those children (or, when there are no
    children, in the same place the text went). ``span`` must have a parent.
    """
    parent = span.getparent()
    idx = parent.index(span)
    last_child = span[-1] if len(span) else None

    def attach_before_idx(text):
        # Text that belongs just before position idx lives on the tail of the
        # preceding sibling, or on the parent's text when there is none.
        if idx == 0:
            add_text(parent, 'text', text)
        else:
            add_text(parent[idx - 1], 'tail', text)

    if span.text:
        attach_before_idx(span.text)

    # Inserting in reverse at a fixed index keeps the children in order.
    for child in reversed(span):
        parent.insert(idx, child)
    parent.remove(span)

    # The tail is re-attached only after the span has left the tree.
    if span.tail:
        if last_child is None:
            attach_before_idx(span.tail)
        else:
            add_text(last_child, 'tail', span.tail)
|
|
|
|
def before_count(root, tag, limit=10):
    """Count the elements that precede ``tag`` inside the document body.

    The descendants of the first element matched by ``//body[1]`` are walked
    in iteration order and counted until ``tag`` (compared by identity) is
    reached.

    :param root: Element offering ``xpath()``.
    :param tag: The element to look for.
    :param limit: Upper bound on the answer; counting stops once more than
        ``limit`` elements have been seen.
    :return: The number of elements seen before ``tag``, or ``limit`` when
        there is no body, when ``tag`` is further away than ``limit``
        elements, or when ``tag`` is not found at all.
    """
    body = root.xpath('//body[1]')
    if not body:
        return limit
    ans = 0
    for elem in body[0].iterdescendants():
        if elem is tag:
            return ans
        ans += 1
        if ans > limit:
            return limit
    # tag was not among the body's descendants. Always hand back a number:
    # falling off the end would return None, which callers then compare
    # against an integer.
    return limit
|
|
|
|
def _move_trailing_hrs(root):
    # Move <hr>s outside paragraphs, if possible: an <hr> inside a <span> that
    # is the last descendant of its enclosing paragraph/heading is re-inserted
    # right after that block.
    pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
    for hr in root.xpath('//span/hr'):
        p = pancestor(hr)
        if p:
            p = p[0]
            descendants = tuple(p.iterdescendants())
            if descendants[-1] is hr:
                parent = p.getparent()
                idx = parent.index(p)
                parent.insert(idx+1, hr)
                # NOTE(review): this overwrites whatever tail the <hr> had.
                hr.tail = '\n\t'


def _merge_adjacent_spans(root):
    # Merge consecutive spans that have the same styling (see mergeable()).
    current_run = []
    for span in root.xpath('//span[not(@style or @lang)]'):
        if current_run and mergeable(current_run[-1], span):
            current_run.append(span)
            continue
        if len(current_run) > 1:
            merge_run(current_run)
        current_run = [span]
    # The run still open when the spans are exhausted has to be merged too,
    # otherwise the last group of mergeable spans is left untouched.
    if len(current_run) > 1:
        merge_run(current_run)


def _unwrap_sole_spans(root, class_map):
    # Remove unnecessary span tags that are the only child of a parent block
    # element, moving the span's class and lang onto the block.
    parents = ('p', 'div') + tuple('h%d' % i for i in range(1, 7))
    for parent in root.xpath('//*[(%s) and count(span)=1]' % ' or '.join('name()="%s"' % t for t in parents)):
        if len(parent) == 1 and not parent.text and not parent[0].tail and not parent[0].get('id', None):
            # We have a block whose contents are entirely enclosed in a <span>
            span = parent[0]
            span_class = span.get('class', None)
            span_css = class_map.get(span_class, {})
            if liftable(span_css):
                pclass = parent.get('class', None)
                if span_class:
                    pclass = (pclass + ' ' + span_class) if pclass else span_class
                    parent.set('class', pclass)
                parent.text = span.text
                parent.remove(span)
                if span.get('lang'):
                    parent.set('lang', span.get('lang'))
                for child in span:
                    parent.append(child)


def _spans_to_b_and_i(root, class_map):
    # Make spans whose only styling is bold or italic into <b> and <i> tags.
    for span in root.xpath('//span[@class and not(@style)]'):
        css = class_map.get(span.get('class', None), {})
        if len(css) == 1:
            if css == {'font-style':'italic'}:
                span.tag = 'i'
                del span.attrib['class']
            elif css == {'font-weight':'bold'}:
                span.tag = 'b'
                del span.attrib['class']


def _strip_unstyled_spans(root):
    # Get rid of <span>s that have no styling, keeping their content.
    for span in root.xpath('//span[not(@class or @id or @style or @lang)]'):
        lift(span)


def _convert_page_breaks(root):
    # Convert <p><br style="page-break-after:always"> </p> style page breaks
    # into something the viewer will render as a page break.
    for p in root.xpath('//p[br[@style="page-break-after:always"]]'):
        if len(p) == 1 and (not p[0].tail or not p[0].tail.strip()):
            p.remove(p[0])
            prefix = p.get('style', '')
            if prefix:
                prefix += '; '
            p.set('style', prefix + 'page-break-after:always')
            # Give the paragraph some text if it has none.
            p.text = NBSP if not p.text else p.text


def _extract_cover(log, root, dest_dir):
    # Check if the first image in the document is possibly a cover. If so,
    # remove it from the tree and return the path of the image file.
    img = root.xpath('//img[@src][1]')
    if not img:
        return None
    img = img[0]
    path = os.path.join(dest_dir, img.get('src'))
    if not (os.path.exists(path) and before_count(root, img, limit=10) < 5):
        return None
    from calibre.utils.magick.draw import identify
    try:
        width, height, fmt = identify(path)
    except Exception:
        # Best effort: an image that cannot be identified is simply not a
        # cover. Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        width, height, fmt = 0, 0, None
    del fmt
    try:
        is_cover = 0.8 <= height/width <= 1.8 and height*width >= 160000
    except ZeroDivisionError:
        is_cover = False
    if not is_cover:
        return None
    log.debug('Detected an image that looks like a cover')
    # NOTE(review): any tail text of the <img> presumably leaves the tree
    # together with it (lift() re-attaches the tail after remove()) - confirm
    # that is acceptable.
    img.getparent().remove(img)
    return path


def cleanup_markup(log, root, styles, dest_dir, detect_cover):
    """Tidy the generated markup under ``root`` in place.

    The following passes run in order:

    1. <hr>s that end a paragraph or heading are moved out of it.
    2. Consecutive spans with the same styling are merged.
    3. A span that is the sole content of a block element is removed when its
       styling can live on the block instead.
    4. Spans whose only styling is italic or bold become <i> / <b>.
    5. Spans without class, id, style or lang are replaced by their content.
    6. Paragraphs holding only a page-break <br> become page-break paragraphs.
    7. Optionally, the first image is checked for being a cover.

    :param log: Logger; only ``log.debug()`` is used.
    :param root: Root element of the tree (must offer ``xpath()``).
    :param styles: Object whose ``classes`` mapping is read. Its values are
        presumably (class name, css mapping) pairs, since they are fed to
        ``dict()`` to build the class name -> css lookup - TODO confirm.
    :param dest_dir: Directory against which image ``src`` values are
        resolved.
    :param detect_cover: If true, run the cover detection pass.
    :return: Path of the image file detected (and removed from the tree) as
        the cover, or None when no cover was detected or detection is off.
    """
    _move_trailing_hrs(root)
    _merge_adjacent_spans(root)

    # values() instead of the Python 2 only itervalues().
    class_map = dict(styles.classes.values())
    _unwrap_sole_spans(root, class_map)
    _spans_to_b_and_i(root, class_map)
    _strip_unstyled_spans(root)
    _convert_page_breaks(root)

    if detect_cover:
        return _extract_cover(log, root, dest_dir)
    return None
|
|
|