mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Removed accidentally added file.
This commit is contained in:
parent
361d294232
commit
8f1e5cad88
@ -1,149 +0,0 @@
|
||||
#! /usr/bin/python
|
||||
|
||||
from __future__ import with_statement
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import types
|
||||
import copy
|
||||
import itertools
|
||||
from collections import defaultdict
|
||||
from lxml import etree
|
||||
from stylizer import Page, Stylizer, Style
|
||||
|
||||
# Namespace URI of XHTML elements.
XHTML_NS = 'http://www.w3.org/1999/xhtml'

# Prefix map handed to lxml XPath calls so that 'h:' resolves to XHTML.
XPNSMAP = dict(h=XHTML_NS)
|
||||
|
||||
class Splitter(object):
    """Split one (X)HTML file into several files at CSS page breaks.

    The document is walked depth-first; whenever the computed style of an
    element requests a page break ('always', 'odd' or 'even') and some
    content has been seen since the last break, a new output tree is
    started.  Links pointing at anchors inside the original file are
    rewritten afterwards to point at the output file holding the anchor.
    """

    XML_PARSER = etree.XMLParser(remove_blank_text=True)
    # Runs of spaces/newlines are collapsed to one space before counting.
    COLLAPSE = re.compile(r'[ \n\r]+')
    # Elements that count as content even when they carry no text; both
    # the bare and the XHTML-namespaced tag names are included.
    CONTENT_TAGS = set(['img', 'object', 'embed'])
    for tag in list(CONTENT_TAGS):
        CONTENT_TAGS.add('{%s}%s' % (XHTML_NS, tag))

    def __init__(self, path):
        """Parse the file at `path` and prepare empty split state."""
        with open(path, 'rb') as f:
            self.tree = etree.parse(f, parser=self.XML_PARSER)
        self.stylizer = Stylizer(self.tree, path)
        self.path = path
        # Lower-cased file name without extension; default output name.
        self.basename = os.path.splitext(
            os.path.basename(path))[0].lower()
        self.splits = []    # root element of every output document
        self.names = []     # output file names, parallel to self.splits
        self.idmap = {}     # id/name attribute value -> output file name
        self.fonts = defaultdict(int)   # font size -> characters seen
        self.content = False    # content seen since the last break?

    def split(self):
        """Split the parsed document.

        Returns a list of (file name, root element) pairs, one per output
        document.  Raises ValueError (from tuple unpacking) when the
        document does not have exactly one html, head and body element.
        """
        tree = self.tree
        # Try the un-namespaced document first, then the XHTML namespace.
        for prefix in ('', 'h:'):
            d = {'h': prefix}
            roots = tree.xpath('/%(h)shtml' % d, namespaces=XPNSMAP)
            if roots:
                break
        self.root, = roots
        self.head, = tree.xpath('/%(h)shtml/%(h)shead' % d,
                                namespaces=XPNSMAP)
        body, = tree.xpath('/%(h)shtml/%(h)sbody' % d, namespaces=XPNSMAP)
        self._split(body, [self.new_root(str(self.basename))], 9.0)
        # Materialize the pairs: they are iterated here and again by the
        # caller, which a one-shot zip iterator would not survive.
        results = list(zip(self.names, self.splits))
        self.post_process_links(results, d)
        return results

    def new_root(self, name):
        """Start a new output document called `name` + '.html'.

        The new root is a shallow copy of the original root element with a
        deep copy of the original head appended.  Returns the new root.
        """
        nroot = self.dup(self.root)
        nroot.append(copy.deepcopy(self.head))
        self.splits.append(nroot)
        self.names.append(name + '.html')
        return nroot

    def dup(self, e):
        """Return a childless copy of element `e` (tag, attributes,
        namespace map, text and tail)."""
        # Attributes go through `attrib=` rather than **kwargs so that
        # names such as 'attrib'/'nsmap' or non-str keys cannot clash
        # with the keyword arguments of the factory.
        new = etree.Element(e.tag, attrib=dict(e.attrib), nsmap=e.nsmap)
        new.text = e.text
        new.tail = e.tail
        return new

    def dupsub(self, p, e):
        """Append a childless copy of element `e` to parent `p` and
        return the copy."""
        new = etree.SubElement(p, e.tag, attrib=dict(e.attrib),
                               nsmap=e.nsmap)
        new.text = e.text
        new.tail = e.tail
        return new

    def _split(self, src, dstq, psize):
        """Copy `src` and its descendants into the current output tree.

        src   -- source element being visited
        dstq  -- stack of destination elements; dstq[-1] receives the copy
                 of `src`.  Modified in place when a split happens.
        psize -- font size of the parent element; the tail text of `src`
                 is accounted to it.
        """
        style = self.stylizer.style(src)
        if self.new_page(style, 'before'):
            self.new_split(src, dstq)
        attrib = src.attrib
        # Remember which output file holds each anchor.
        name = self.names[-1]
        for aname in ('id', 'name'):
            if aname in attrib:
                self.idmap[attrib[aname]] = name
        text = self.COLLAPSE.sub(' ', src.text or '')
        tail = self.COLLAPSE.sub(' ', src.tail or '')
        if text or tail or src.tag.lower() in self.CONTENT_TAGS:
            self.content = True
        size = style['font-size']
        self.fonts[size] += len(text)
        self.fonts[psize] += len(tail)
        new = self.dupsub(dstq[-1], src)
        if len(src) > 0:
            dstq.append(new)
            for child in src:
                self._split(child, dstq, size)
            dstq.pop()
        if self.new_page(style, 'after'):
            self.new_split(src, dstq)

    def new_page(self, style, when):
        """Return True when a page break is due.

        `when` is 'before' or 'after'.  A break is only honoured once
        content has been seen since the previous break.
        """
        if not self.content:
            return False
        return style['page-break-%s' % when] in ('always', 'odd', 'even')

    def new_split(self, src, dstq):
        """Begin a new output document and rebuild `dstq` inside it.

        The document name is derived from the first class name of `src`
        (falling back to the source basename), suffixed with its id when
        present.  Every entry of `dstq` is replaced by a childless copy
        nested in the new document.
        """
        name = self.basename
        attrib = src.attrib
        if 'class' in attrib:
            # First class name; an empty/blank attribute keeps the
            # basename instead of producing an empty file name.
            classes = attrib['class'].split()
            if classes:
                name = classes[0]
        if 'id' in attrib:
            name = '%s-%s' % (name, attrib['id'])
        name = name.lower().replace('_', '-')
        if (name + '.html') in self.names:
            name = '%s-%02d' % (name, len(self.names))
        prev = None
        for i, old in enumerate(dstq):
            if prev is None:
                new = self.new_root(name)
            else:
                new = self.dupsub(prev, old)
            prev = dstq[i] = new
        self.content = False

    def post_process_links(self, results, prefixes):
        """Point in-document fragment links at the right output file.

        results  -- iterable of (file name, root element) pairs
        prefixes -- mapping with key 'h' holding the XPath prefix in use

        Links whose fragment was never recorded in `self.idmap` are left
        untouched.
        """
        basename = os.path.basename(self.path)
        query = '//%(h)sa[@href]' % prefixes
        for _name, root in results:
            for element in root.xpath(query, namespaces=XPNSMAP):
                href = element.attrib['href']
                if '#' not in href:
                    continue
                # maxsplit=1 always yields exactly two parts, even when
                # the fragment itself contains a '#'.
                fname, frag = href.split('#', 1)
                if fname not in ('', basename):
                    continue
                target = self.idmap.get(frag)
                if target is None:
                    continue
                element.attrib['href'] = '#'.join((target, frag))
|
||||
|
||||
def main():
    """Command-line entry point: split argv[1] into directory argv[2].

    Prints the name of every file written.  Returns 0 on success and 2
    when the two required arguments are missing.
    """
    def xml2str(root):
        # Serialize as pretty-printed UTF-8 with an XML declaration.
        return etree.tostring(root, pretty_print=True,
                              encoding='utf-8', xml_declaration=True)

    if len(sys.argv) < 3:
        sys.stderr.write('Usage: %s <input.html> <output-dir>\n'
                         % os.path.basename(sys.argv[0]))
        return 2
    path = sys.argv[1]
    dest = sys.argv[2]
    splitter = Splitter(path)
    for name, root in splitter.split():
        # Single-argument call form prints identically on Python 2 and 3.
        print(name)
        with open(os.path.join(dest, name), 'wb') as f:
            f.write(xml2str(root))
    return 0
|
||||
|
||||
# Run the splitter when executed as a script; main()'s return value
# becomes the process exit status.
if '__main__' == __name__:
    raise SystemExit(main())
|
Loading…
x
Reference in New Issue
Block a user