mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Untested implementation of HTML input. Uses a new transform that 'packages' an OEB book into a folder structure (the same folder structure that was used in the old codebase for EPUB output). This may have broken other thin gs, so use with care.
This commit is contained in:
parent
b2bfab32cf
commit
093b98a9f1
@ -122,8 +122,9 @@ class InputFormatPlugin(Plugin):
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
'''
|
||||
This method must be implemented in sub-classes. It must return
|
||||
the path to the created OPF file. All output should be contained in
|
||||
the current directory. If this plugin creates files outside the current
|
||||
the path to the created OPF file or an :class:`OEBBook` instance.
|
||||
All output should be contained in the current directory.
|
||||
If this plugin creates files outside the current
|
||||
directory they must be deleted/marked for deletion before this method
|
||||
returns.
|
||||
|
||||
|
@ -299,21 +299,15 @@ OptionRecommendation(name='language',
|
||||
|
||||
# Create an OEBBook from the input file. The input plugin does all the
|
||||
# heavy lifting.
|
||||
from calibre.ebooks.oeb.reader import OEBReader
|
||||
from calibre.ebooks.oeb.base import OEBBook
|
||||
accelerators = {}
|
||||
|
||||
tdir = PersistentTemporaryDirectory('_plumber')
|
||||
|
||||
opfpath = self.input_plugin(open(self.input, 'rb'), self.opts,
|
||||
self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts,
|
||||
self.input_fmt, self.log,
|
||||
accelerators, tdir)
|
||||
html_preprocessor = HTMLPreProcessor()
|
||||
self.reader = OEBReader()
|
||||
self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor)
|
||||
# Read OEB Book into OEBBook
|
||||
self.log.info('Parsing all content...')
|
||||
self.reader(self.oeb, opfpath)
|
||||
if not hasattr(self.oeb, 'manifest'):
|
||||
self.oeb = create_oebbook(self.log, self.oeb)
|
||||
|
||||
self.opts.source = self.opts.input_profile
|
||||
self.opts.dest = self.opts.output_profile
|
||||
@ -340,7 +334,20 @@ OptionRecommendation(name='language',
|
||||
trimmer(self.oeb, self.opts)
|
||||
|
||||
self.log.info('Creating %s...'%self.output_plugin.name)
|
||||
self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts,
|
||||
self.log)
|
||||
self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
|
||||
self.opts, self.log)
|
||||
|
||||
def create_oebbook(log, opfpath):
|
||||
'''
|
||||
Create an OEBBook from an OPF file.
|
||||
'''
|
||||
from calibre.ebooks.oeb.reader import OEBReader
|
||||
from calibre.ebooks.oeb.base import OEBBook
|
||||
html_preprocessor = HTMLPreProcessor()
|
||||
reader = OEBReader()
|
||||
oeb = OEBBook(log, html_preprocessor=html_preprocessor)
|
||||
# Read OEB Book into OEBBook
|
||||
log.info('Parsing all content...')
|
||||
reader(oeb, opfpath)
|
||||
return oeb
|
||||
|
||||
|
@ -10,23 +10,23 @@ import sys, textwrap, re, os, uuid
|
||||
from itertools import cycle
|
||||
from calibre.utils.config import Config, StringConfig
|
||||
from calibre.utils.zipfile import ZipFile, ZIP_STORED
|
||||
from calibre.ebooks.html import config as common_config, tostring
|
||||
from calibre.ebooks.html import tostring
|
||||
from lxml import etree
|
||||
|
||||
class DefaultProfile(object):
|
||||
|
||||
|
||||
flow_size = sys.maxint
|
||||
screen_size = None
|
||||
remove_special_chars = False
|
||||
remove_object_tags = False
|
||||
|
||||
|
||||
class PRS505(DefaultProfile):
|
||||
|
||||
|
||||
flow_size = 270000
|
||||
screen_size = (590, 765)
|
||||
remove_special_chars = re.compile(u'[\u200b\u00ad]')
|
||||
remove_object_tags = True
|
||||
|
||||
|
||||
|
||||
PROFILES = {
|
||||
'PRS505' : PRS505,
|
||||
@ -64,11 +64,11 @@ def config(defaults=None, name='epub'):
|
||||
c = Config(name, desc)
|
||||
else:
|
||||
c = StringConfig(defaults, desc)
|
||||
|
||||
|
||||
c.update(common_config())
|
||||
c.remove_opt('output')
|
||||
c.remove_opt('zip')
|
||||
|
||||
|
||||
c.add_opt('output', ['-o', '--output'], default=None,
|
||||
help=_('The output EPUB file. If not specified, it is '
|
||||
'derived from the input file name.'))
|
||||
@ -81,22 +81,22 @@ def config(defaults=None, name='epub'):
|
||||
help=_('Either the path to a CSS stylesheet or raw CSS. '
|
||||
'This CSS will override any existing CSS '
|
||||
'declarations in the source files.'))
|
||||
structure = c.add_group('structure detection',
|
||||
structure = c.add_group('structure detection',
|
||||
_('Control auto-detection of document structure.'))
|
||||
structure('chapter', ['--chapter'],
|
||||
structure('chapter', ['--chapter'],
|
||||
default="//*[re:match(name(), 'h[1-2]') and "
|
||||
"re:test(., 'chapter|book|section|part', 'i')] | "
|
||||
"//*[@class = 'chapter']",
|
||||
help=_('''\
|
||||
An XPath expression to detect chapter titles. The default is to consider <h1> or
|
||||
<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles as
|
||||
well as any tags that have class="chapter".
|
||||
<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles as
|
||||
well as any tags that have class="chapter".
|
||||
The expression used must evaluate to a list of elements. To disable chapter detection,
|
||||
use the expression "/". See the XPath Tutorial in the calibre User Manual for further
|
||||
help on using this feature.
|
||||
''').replace('\n', ' '))
|
||||
structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both', 'none'],
|
||||
default='pagebreak',
|
||||
default='pagebreak',
|
||||
help=_('Specify how to mark detected chapters. A value of '
|
||||
'"pagebreak" will insert page breaks before chapters. '
|
||||
'A value of "rule" will insert a line before chapters. '
|
||||
@ -129,13 +129,13 @@ help on using this feature.
|
||||
help=_('XPath expression to find the name of each page in the '
|
||||
'pagination map relative to its boundary element. '
|
||||
'Default is to number all pages staring with 1.'))
|
||||
toc = c.add_group('toc',
|
||||
toc = c.add_group('toc',
|
||||
_('''\
|
||||
Control the automatic generation of a Table of Contents. If an OPF file is detected
|
||||
and it specifies a Table of Contents, then that will be used rather than trying
|
||||
to auto-generate a Table of Contents.
|
||||
''').replace('\n', ' '))
|
||||
toc('max_toc_links', ['--max-toc-links'], default=50,
|
||||
toc('max_toc_links', ['--max-toc-links'], default=50,
|
||||
help=_('Maximum number of links to insert into the TOC. Set to 0 '
|
||||
'to disable. Default is: %default. Links are only added to the '
|
||||
'TOC if less than the --toc-threshold number of chapters were detected.'))
|
||||
@ -166,15 +166,15 @@ to auto-generate a Table of Contents.
|
||||
help=_('Normally, if the source file already has a Table of Contents, '
|
||||
'it is used in preference to the auto-generated one. '
|
||||
'With this option, the auto-generated one is always used.'))
|
||||
|
||||
|
||||
layout = c.add_group('page layout', _('Control page layout'))
|
||||
layout('margin_top', ['--margin-top'], default=5.0,
|
||||
layout('margin_top', ['--margin-top'], default=5.0,
|
||||
help=_('Set the top margin in pts. Default is %default'))
|
||||
layout('margin_bottom', ['--margin-bottom'], default=5.0,
|
||||
layout('margin_bottom', ['--margin-bottom'], default=5.0,
|
||||
help=_('Set the bottom margin in pts. Default is %default'))
|
||||
layout('margin_left', ['--margin-left'], default=5.0,
|
||||
layout('margin_left', ['--margin-left'], default=5.0,
|
||||
help=_('Set the left margin in pts. Default is %default'))
|
||||
layout('margin_right', ['--margin-right'], default=5.0,
|
||||
layout('margin_right', ['--margin-right'], default=5.0,
|
||||
help=_('Set the right margin in pts. Default is %default'))
|
||||
layout('base_font_size2', ['--base-font-size'], default=12.0,
|
||||
help=_('The base font size in pts. Default is %defaultpt. '
|
||||
@ -195,12 +195,12 @@ to auto-generate a Table of Contents.
|
||||
'This is only neccessary if the HTML files contain CSS that '
|
||||
'uses sibling selectors. Enabling this greatly slows down '
|
||||
'processing of large HTML files.'))
|
||||
|
||||
|
||||
c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
|
||||
help=_('Print generated OPF file to stdout'))
|
||||
c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',
|
||||
help=_('Print generated NCX file to stdout'))
|
||||
c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug',
|
||||
c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug',
|
||||
default=False,
|
||||
help=_('Keep intermediate files during processing by html2epub'))
|
||||
c.add_opt('extract_to', ['--extract-to'], group='debug', default=None,
|
||||
|
@ -14,7 +14,7 @@ from lxml.cssselect import CSSSelector
|
||||
from lxml import etree
|
||||
from lxml.html import HtmlElement
|
||||
|
||||
from calibre.ebooks.html import fromstring
|
||||
from calibre.ebooks.html_old import fromstring
|
||||
from calibre.ebooks.epub import rules
|
||||
from cssutils import CSSParser
|
||||
|
||||
@ -24,7 +24,7 @@ absolute_size = r'(?P<abs>(x?x-)?(small|large)|medium)'
|
||||
relative_size = r'(?P<rel>smaller|larger)'
|
||||
|
||||
font_size_pat = re.compile('|'.join((relative_size, absolute_size, length)), re.I)
|
||||
line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num))
|
||||
line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num))
|
||||
|
||||
PTU = {
|
||||
'in' : 72.,
|
||||
@ -37,12 +37,12 @@ PTU = {
|
||||
DEFAULT_FONT_SIZE = 12
|
||||
|
||||
class Rationalizer(object):
|
||||
|
||||
|
||||
@classmethod
|
||||
def specificity(cls, s):
|
||||
'''Map CSS specificity tuple to a single integer'''
|
||||
return sum([10**(4-i) + x for i,x in enumerate(s)])
|
||||
|
||||
return sum([10**(4-i) + x for i,x in enumerate(s)])
|
||||
|
||||
@classmethod
|
||||
def compute_font_size(cls, elem):
|
||||
'''
|
||||
@ -59,7 +59,7 @@ class Rationalizer(object):
|
||||
elem.computed_font_size = sfs(parent.computed_font_size)
|
||||
else:
|
||||
elem.computed_font_size = sfs
|
||||
|
||||
|
||||
@classmethod
|
||||
def calculate_font_size(cls, style):
|
||||
'Return font size in pts from style object. For relative units returns a callable'
|
||||
@ -69,7 +69,7 @@ class Rationalizer(object):
|
||||
fs = match.group()
|
||||
if style.fontSize:
|
||||
fs = style.fontSize
|
||||
|
||||
|
||||
match = font_size_pat.search(fs)
|
||||
if match is None:
|
||||
return None
|
||||
@ -89,8 +89,8 @@ class Rationalizer(object):
|
||||
return 12 * x
|
||||
if match.get('zero', False):
|
||||
return 0.
|
||||
return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8)
|
||||
|
||||
return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8)
|
||||
|
||||
@classmethod
|
||||
def resolve_rules(cls, stylesheets):
|
||||
for sheet in stylesheets:
|
||||
@ -104,12 +104,12 @@ class Rationalizer(object):
|
||||
if font_size is not None:
|
||||
for s in r.selectorList:
|
||||
sheet.fs_rules.append([CSSSelector(s.selectorText), font_size])
|
||||
orig = line_height_pat.search(r.style.lineHeight)
|
||||
orig = line_height_pat.search(r.style.lineHeight)
|
||||
if orig is not None:
|
||||
for s in r.selectorList:
|
||||
sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]])
|
||||
|
||||
|
||||
|
||||
|
||||
@classmethod
|
||||
def apply_font_size_rules(cls, stylesheets, root):
|
||||
'Add a ``specified_font_size`` attribute to every element that has a specified font size'
|
||||
@ -119,7 +119,7 @@ class Rationalizer(object):
|
||||
elems = selector(root)
|
||||
for elem in elems:
|
||||
elem.specified_font_size = font_size
|
||||
|
||||
|
||||
@classmethod
|
||||
def remove_font_size_information(cls, stylesheets):
|
||||
for r in rules(stylesheets):
|
||||
@ -134,17 +134,17 @@ class Rationalizer(object):
|
||||
r.style.removeProperty('font')
|
||||
if line_height_pat.search(r.style.lineHeight) is not None:
|
||||
r.style.removeProperty('line-height')
|
||||
|
||||
|
||||
@classmethod
|
||||
def compute_font_sizes(cls, root, stylesheets, base=12):
|
||||
stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')]
|
||||
cls.apply_font_size_rules(stylesheets, root)
|
||||
|
||||
|
||||
# Compute the effective font size of all tags
|
||||
root.computed_font_size = DEFAULT_FONT_SIZE
|
||||
for elem in root.iter(etree.Element):
|
||||
cls.compute_font_size(elem)
|
||||
|
||||
|
||||
extra_css = {}
|
||||
if base > 0:
|
||||
# Calculate the "base" (i.e. most common) font size
|
||||
@ -157,20 +157,20 @@ class Rationalizer(object):
|
||||
if t: t = t.strip()
|
||||
if t:
|
||||
font_sizes[elem.computed_font_size] += len(t)
|
||||
|
||||
|
||||
t = getattr(elem, 'tail', '')
|
||||
if t: t = t.strip()
|
||||
if t:
|
||||
parent = elem.getparent()
|
||||
if parent.tag not in IGNORE:
|
||||
font_sizes[parent.computed_font_size] += len(t)
|
||||
|
||||
|
||||
try:
|
||||
most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0]
|
||||
scale = base/most_common if most_common > 0 else 1.
|
||||
except ValueError:
|
||||
scale = 1.
|
||||
|
||||
|
||||
# rescale absolute line-heights
|
||||
counter = 0
|
||||
for sheet in stylesheets:
|
||||
@ -181,17 +181,17 @@ class Rationalizer(object):
|
||||
if not extra_css.has_key(elem.get('id')):
|
||||
extra_css[elem.get('id')] = []
|
||||
extra_css[elem.get('id')].append('line-height:%fpt'%(lh*scale))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Rescale all computed font sizes
|
||||
for elem in body.iter(etree.Element):
|
||||
if isinstance(elem, HtmlElement):
|
||||
elem.computed_font_size *= scale
|
||||
|
||||
# Remove all font size specifications from the last stylesheet
|
||||
|
||||
# Remove all font size specifications from the last stylesheet
|
||||
cls.remove_font_size_information(stylesheets[-1:])
|
||||
|
||||
|
||||
# Create the CSS to implement the rescaled font sizes
|
||||
for elem in body.iter(etree.Element):
|
||||
cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent()))
|
||||
@ -201,12 +201,12 @@ class Rationalizer(object):
|
||||
if not extra_css.has_key(elem.get('id')):
|
||||
extra_css[elem.get('id')] = []
|
||||
extra_css[elem.get('id')].append('font-size: %f%%'%(100*(cfs/pcfs)))
|
||||
|
||||
|
||||
css = CSSParser(loglevel=logging.ERROR).parseString('')
|
||||
for id, r in extra_css.items():
|
||||
css.add('#%s {%s}'%(id, ';'.join(r)))
|
||||
return css
|
||||
|
||||
|
||||
@classmethod
|
||||
def rationalize(cls, stylesheets, root, opts):
|
||||
logger = logging.getLogger('html2epub')
|
||||
@ -229,7 +229,7 @@ class Rationalizer(object):
|
||||
################################################################################
|
||||
|
||||
class FontTest(unittest.TestCase):
|
||||
|
||||
|
||||
def setUp(self):
|
||||
from calibre.ebooks.epub import config
|
||||
self.opts = config(defaults='').parse()
|
||||
@ -246,10 +246,10 @@ class FontTest(unittest.TestCase):
|
||||
<p id="p2">Some other <span class="it">text</span>.</p>
|
||||
<p id="longest">The longest piece of single font size text in this entire file. Used to test resizing.</p>
|
||||
</body>
|
||||
</html>
|
||||
</html>
|
||||
'''
|
||||
self.root = fromstring(self.html)
|
||||
|
||||
|
||||
def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1):
|
||||
root1 = copy.deepcopy(self.root)
|
||||
root1.computed_font_size = DEFAULT_FONT_SIZE
|
||||
@ -262,39 +262,39 @@ class FontTest(unittest.TestCase):
|
||||
for elem in root2.iter(etree.Element):
|
||||
Rationalizer.compute_font_size(elem)
|
||||
for e1, e2 in zip(root1.xpath('//body')[0].iter(etree.Element), root2.xpath('//body')[0].iter(etree.Element)):
|
||||
self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size,
|
||||
self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size,
|
||||
msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\
|
||||
(root1.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size))
|
||||
return stylesheet2.cssText
|
||||
|
||||
|
||||
def testStripping(self):
|
||||
'Test that any original entries are removed from the CSS'
|
||||
css = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }'
|
||||
css = CSSParser(loglevel=logging.ERROR).parseString(css)
|
||||
Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [css])
|
||||
self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''),
|
||||
self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''),
|
||||
'p{font:bolditalic}')
|
||||
|
||||
|
||||
def testIdentity(self):
|
||||
'Test that no unnecessary font size changes are made'
|
||||
extra_css = self.do_test('div {font-size:12pt} \nspan {font-size:100%}')
|
||||
self.assertEqual(extra_css.strip(), '')
|
||||
|
||||
|
||||
def testRelativization(self):
|
||||
'Test conversion of absolute to relative sizes'
|
||||
self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}')
|
||||
|
||||
|
||||
def testResizing(self):
|
||||
'Test resizing of fonts'
|
||||
self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}')
|
||||
|
||||
|
||||
|
||||
def suite():
|
||||
return unittest.TestLoader().loadTestsFromTestCase(FontTest)
|
||||
|
||||
|
||||
def test():
|
||||
unittest.TextTestRunner(verbosity=2).run(suite())
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(test())
|
||||
|
||||
sys.exit(test())
|
||||
|
||||
|
@ -38,7 +38,7 @@ from lxml.etree import XPath
|
||||
from lxml import html, etree
|
||||
from PyQt4.Qt import QApplication, QPixmap
|
||||
|
||||
from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\
|
||||
from calibre.ebooks.html_old import Processor, merge_metadata, get_filelist,\
|
||||
opf_traverse, create_metadata, rebase_toc, Link, parser
|
||||
from calibre.ebooks.epub import config as common_config, tostring
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
|
@ -16,7 +16,7 @@ from calibre.ebooks.epub import config
|
||||
from calibre.ebooks.metadata.opf2 import OPF
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.html import create_dir
|
||||
from calibre.ebooks.html_old import create_dir
|
||||
from calibre.utils.zipfile import safe_replace, ZipFile
|
||||
from calibre.utils.config import DynamicConfig
|
||||
|
||||
|
@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
|
||||
Split the flows in an epub file to conform to size limitations.
|
||||
'''
|
||||
|
||||
import os, math, logging, functools, collections, re, copy, sys
|
||||
import os, math, functools, collections, re, copy, sys
|
||||
|
||||
from lxml.etree import XPath as _XPath
|
||||
from lxml import etree, html
|
||||
@ -24,16 +24,16 @@ SPLIT_ATTR = 'cs'
|
||||
SPLIT_POINT_ATTR = 'csp'
|
||||
|
||||
class SplitError(ValueError):
|
||||
|
||||
|
||||
def __init__(self, path, root):
|
||||
size = len(tostring(root))/1024.
|
||||
ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')%
|
||||
ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')%
|
||||
(os.path.basename(path), size))
|
||||
|
||||
|
||||
|
||||
|
||||
class Splitter(object):
|
||||
|
||||
|
||||
def __init__(self, path, opts, stylesheet_map, opf):
|
||||
self.setup_cli_handler(opts.verbose)
|
||||
self.path = path
|
||||
@ -44,10 +44,10 @@ class Splitter(object):
|
||||
self.orig_size = os.stat(content(path)).st_size
|
||||
self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
|
||||
root = html.fromstring(open(content(path)).read())
|
||||
|
||||
|
||||
self.page_breaks, self.trees = [], []
|
||||
self.split_size = 0
|
||||
|
||||
|
||||
# Split on page breaks
|
||||
self.splitting_on_page_breaks = True
|
||||
if not opts.dont_split_on_page_breaks:
|
||||
@ -59,7 +59,7 @@ class Splitter(object):
|
||||
else:
|
||||
self.trees = [root.getroottree()]
|
||||
trees = list(self.trees)
|
||||
|
||||
|
||||
# Split any remaining over-sized trees
|
||||
self.splitting_on_page_breaks = False
|
||||
if self.opts.profile.flow_size < sys.maxint:
|
||||
@ -67,7 +67,7 @@ class Splitter(object):
|
||||
self.log_info('\tLooking for large trees...')
|
||||
for i, tree in enumerate(list(trees)):
|
||||
self.trees = []
|
||||
size = len(tostring(tree.getroot()))
|
||||
size = len(tostring(tree.getroot()))
|
||||
if size > self.opts.profile.flow_size:
|
||||
lt_found = True
|
||||
try:
|
||||
@ -81,7 +81,7 @@ class Splitter(object):
|
||||
trees[i:i+1] = list(self.trees)
|
||||
if not lt_found:
|
||||
self.log_info('\tNo large trees found')
|
||||
|
||||
|
||||
self.trees = trees
|
||||
self.was_split = len(self.trees) > 1
|
||||
if self.was_split:
|
||||
@ -91,17 +91,17 @@ class Splitter(object):
|
||||
for f in self.files:
|
||||
self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
|
||||
self.fix_opf(opf)
|
||||
|
||||
|
||||
self.trees = None
|
||||
|
||||
|
||||
|
||||
|
||||
def split_text(self, text, root, size):
|
||||
self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
|
||||
rest = text.replace('\r', '')
|
||||
parts = re.split('\n\n', rest)
|
||||
self.log_debug('\t\t\t\tFound %d parts'%len(parts))
|
||||
if max(map(len, parts)) > size:
|
||||
raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
|
||||
raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
|
||||
ans = []
|
||||
buf = ''
|
||||
for part in parts:
|
||||
@ -111,8 +111,8 @@ class Splitter(object):
|
||||
ans.append(buf)
|
||||
buf = part
|
||||
return ans
|
||||
|
||||
|
||||
|
||||
|
||||
def split_to_size(self, tree):
|
||||
self.log_debug('\t\tSplitting...')
|
||||
root = tree.getroot()
|
||||
@ -134,7 +134,7 @@ class Splitter(object):
|
||||
p = pre.getparent()
|
||||
i = p.index(pre)
|
||||
p[i:i+1] = new_pres
|
||||
|
||||
|
||||
split_point, before = self.find_split_point(root)
|
||||
if split_point is None or self.split_size > 6*self.orig_size:
|
||||
if not self.always_remove:
|
||||
@ -142,7 +142,7 @@ class Splitter(object):
|
||||
'structure preservation. This may cause '
|
||||
'incorrect rendering.'))
|
||||
raise SplitError(self.path, root)
|
||||
|
||||
|
||||
for t in self.do_split(tree, split_point, before):
|
||||
r = t.getroot()
|
||||
if self.is_page_empty(r):
|
||||
@ -151,12 +151,12 @@ class Splitter(object):
|
||||
if size <= self.opts.profile.flow_size:
|
||||
self.trees.append(t)
|
||||
#print tostring(t.getroot(), pretty_print=True)
|
||||
self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
|
||||
self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
|
||||
len(self.trees), size/1024.)
|
||||
self.split_size += size
|
||||
else:
|
||||
self.split_to_size(t)
|
||||
|
||||
|
||||
def is_page_empty(self, root):
|
||||
body = root.find('body')
|
||||
if body is None:
|
||||
@ -170,14 +170,14 @@ class Splitter(object):
|
||||
if img.get('style', '') != 'display:none':
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def do_split(self, tree, split_point, before):
|
||||
'''
|
||||
Split ``tree`` into a *before* and *after* tree at ``split_point``,
|
||||
preserving tag structure, but not duplicating any text.
|
||||
Split ``tree`` into a *before* and *after* tree at ``split_point``,
|
||||
preserving tag structure, but not duplicating any text.
|
||||
All tags that have had their text and tail
|
||||
removed have the attribute ``calibre_split`` set to 1.
|
||||
|
||||
|
||||
:param before: If True tree is split before split_point, otherwise after split_point
|
||||
:return: before_tree, after_tree
|
||||
'''
|
||||
@ -188,7 +188,7 @@ class Splitter(object):
|
||||
body, body2 = root.body, root2.body
|
||||
split_point = root.xpath(path)[0]
|
||||
split_point2 = root2.xpath(path)[0]
|
||||
|
||||
|
||||
def nix_element(elem, top=True):
|
||||
if self.always_remove:
|
||||
parent = elem.getparent()
|
||||
@ -198,18 +198,18 @@ class Splitter(object):
|
||||
else:
|
||||
index = parent.index(elem)
|
||||
parent[index:index+1] = list(elem.iterchildren())
|
||||
|
||||
|
||||
else:
|
||||
elem.text = u''
|
||||
elem.tail = u''
|
||||
elem.set(SPLIT_ATTR, '1')
|
||||
if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']:
|
||||
elem.set('style', 'display:none')
|
||||
|
||||
|
||||
def fix_split_point(sp):
|
||||
if not self.splitting_on_page_breaks:
|
||||
sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')
|
||||
|
||||
sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')
|
||||
|
||||
# Tree 1
|
||||
hit_split_point = False
|
||||
for elem in list(body.iterdescendants(etree.Element)):
|
||||
@ -223,8 +223,8 @@ class Splitter(object):
|
||||
continue
|
||||
if hit_split_point:
|
||||
nix_element(elem)
|
||||
|
||||
|
||||
|
||||
|
||||
# Tree 2
|
||||
hit_split_point = False
|
||||
for elem in list(body2.iterdescendants(etree.Element)):
|
||||
@ -238,17 +238,17 @@ class Splitter(object):
|
||||
continue
|
||||
if not hit_split_point:
|
||||
nix_element(elem, top=False)
|
||||
|
||||
|
||||
return tree, tree2
|
||||
|
||||
|
||||
|
||||
|
||||
def split_on_page_breaks(self, orig_tree):
|
||||
ordered_ids = []
|
||||
for elem in orig_tree.xpath('//*[@id]'):
|
||||
id = elem.get('id')
|
||||
if id in self.page_break_ids:
|
||||
ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
|
||||
|
||||
|
||||
self.trees = []
|
||||
tree = orig_tree
|
||||
for pattern, before in ordered_ids:
|
||||
@ -260,13 +260,13 @@ class Splitter(object):
|
||||
tree = after
|
||||
self.trees.append(tree)
|
||||
self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def find_page_breaks(self, stylesheets, root):
|
||||
'''
|
||||
Find all elements that have either page-break-before or page-break-after set.
|
||||
Populates `self.page_breaks` with id based XPath selectors (for elements that don't
|
||||
Populates `self.page_breaks` with id based XPath selectors (for elements that don't
|
||||
have ids, an id is created).
|
||||
'''
|
||||
page_break_selectors = set([])
|
||||
@ -283,16 +283,16 @@ class Splitter(object):
|
||||
page_break_selectors.add((CSSSelector(rule.selectorText), False))
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
page_breaks = set([])
|
||||
for selector, before in page_break_selectors:
|
||||
for elem in selector(root):
|
||||
elem.pb_before = before
|
||||
page_breaks.add(elem)
|
||||
|
||||
|
||||
for i, elem in enumerate(root.iter()):
|
||||
elem.pb_order = i
|
||||
|
||||
|
||||
page_breaks = list(page_breaks)
|
||||
page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
|
||||
self.page_break_ids = []
|
||||
@ -300,12 +300,12 @@ class Splitter(object):
|
||||
x.set('id', x.get('id', 'calibre_pb_%d'%i))
|
||||
id = x.get('id')
|
||||
self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
|
||||
self.page_break_ids.append(id)
|
||||
|
||||
|
||||
self.page_break_ids.append(id)
|
||||
|
||||
|
||||
def find_split_point(self, root):
|
||||
'''
|
||||
Find the tag at which to split the tree rooted at `root`.
|
||||
Find the tag at which to split the tree rooted at `root`.
|
||||
Search order is:
|
||||
* Heading tags
|
||||
* <div> tags
|
||||
@ -314,7 +314,7 @@ class Splitter(object):
|
||||
* <p> tags
|
||||
* <br> tags
|
||||
* <li> tags
|
||||
|
||||
|
||||
We try to split in the "middle" of the file (as defined by tag counts.
|
||||
'''
|
||||
def pick_elem(elems):
|
||||
@ -325,18 +325,18 @@ class Splitter(object):
|
||||
i = int(math.floor(len(elems)/2.))
|
||||
elems[i].set(SPLIT_POINT_ATTR, '1')
|
||||
return elems[i]
|
||||
|
||||
|
||||
for path in (
|
||||
'//*[re:match(name(), "h[1-6]", "i")]',
|
||||
'//*[re:match(name(), "h[1-6]", "i")]',
|
||||
'/html/body/div',
|
||||
'//pre',
|
||||
'//hr',
|
||||
'//hr',
|
||||
'//p',
|
||||
'//div',
|
||||
'//br',
|
||||
'//li',
|
||||
):
|
||||
elems = root.xpath(path,
|
||||
elems = root.xpath(path,
|
||||
namespaces={'re':'http://exslt.org/regular-expressions'})
|
||||
elem = pick_elem(elems)
|
||||
if elem is not None:
|
||||
@ -345,9 +345,9 @@ class Splitter(object):
|
||||
except:
|
||||
continue
|
||||
return elem, True
|
||||
|
||||
|
||||
return None, True
|
||||
|
||||
|
||||
def commit(self):
|
||||
'''
|
||||
Commit all changes caused by the split. This removes the previously
|
||||
@ -357,7 +357,7 @@ class Splitter(object):
|
||||
'''
|
||||
self.anchor_map = collections.defaultdict(lambda :self.base%0)
|
||||
self.files = []
|
||||
|
||||
|
||||
for i, tree in enumerate(self.trees):
|
||||
root = tree.getroot()
|
||||
self.files.append(self.base%i)
|
||||
@ -367,7 +367,7 @@ class Splitter(object):
|
||||
for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)):
|
||||
elem.attrib.pop(SPLIT_ATTR, None)
|
||||
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
|
||||
|
||||
|
||||
for current, tree in zip(self.files, self.trees):
|
||||
for a in tree.getroot().xpath('//a[@href]'):
|
||||
href = a.get('href').strip()
|
||||
@ -375,10 +375,10 @@ class Splitter(object):
|
||||
anchor = href[1:]
|
||||
file = self.anchor_map[anchor]
|
||||
if file != current:
|
||||
a.set('href', file+href)
|
||||
a.set('href', file+href)
|
||||
open(content(current), 'wb').\
|
||||
write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))
|
||||
|
||||
|
||||
os.remove(content(self.path))
|
||||
|
||||
|
||||
@ -391,12 +391,12 @@ class Splitter(object):
|
||||
id_map = {}
|
||||
for item in items:
|
||||
id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)
|
||||
|
||||
|
||||
for id in id_map.keys():
|
||||
opf.replace_spine_items_by_idref(id, id_map[id])
|
||||
|
||||
|
||||
for ref in opf.iterguide():
|
||||
href = ref.get('href', '')
|
||||
href = ref.get('href', '')
|
||||
if href.startswith('content/'+self.path):
|
||||
href = href.split('#')
|
||||
frag = None
|
||||
@ -408,8 +408,8 @@ class Splitter(object):
|
||||
new_file = self.anchor_map[frag]
|
||||
ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def fix_content_links(html_files, changes, opts):
|
||||
split_files = [f.path for f in changes]
|
||||
anchor_maps = [f.anchor_map for f in changes]
|
||||
@ -420,7 +420,7 @@ def fix_content_links(html_files, changes, opts):
|
||||
files[i:i+1] = changes[j].files
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
|
||||
for htmlfile in files:
|
||||
changed = False
|
||||
root = html.fromstring(open(content(htmlfile), 'rb').read())
|
||||
@ -439,7 +439,7 @@ def fix_content_links(html_files, changes, opts):
|
||||
frag = ('#'+anchor) if anchor else ''
|
||||
a.set('href', newf+frag)
|
||||
changed = True
|
||||
|
||||
|
||||
if changed:
|
||||
open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
|
||||
|
||||
@ -448,7 +448,7 @@ def fix_ncx(path, changes):
|
||||
anchor_maps = [f.anchor_map for f in changes]
|
||||
tree = etree.parse(path)
|
||||
changed = False
|
||||
for content in tree.getroot().xpath('//x:content[@src]',
|
||||
for content in tree.getroot().xpath('//x:content[@src]',
|
||||
namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
|
||||
href = content.get('src')
|
||||
if not href.startswith('#'):
|
||||
@ -481,21 +481,21 @@ def find_html_files(opf):
|
||||
if os.path.exists(content(f)):
|
||||
html_files.append(f)
|
||||
return html_files
|
||||
|
||||
|
||||
|
||||
def split(pathtoopf, opts, stylesheet_map):
|
||||
pathtoopf = os.path.abspath(pathtoopf)
|
||||
opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
|
||||
|
||||
|
||||
with CurrentDir(os.path.dirname(pathtoopf)):
|
||||
html_files = find_html_files(opf)
|
||||
changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files]
|
||||
changes = [c for c in changes if c.was_split]
|
||||
|
||||
|
||||
fix_content_links(html_files, changes, opts)
|
||||
for item in opf.itermanifest():
|
||||
if item.get('media-type', '') == 'application/x-dtbncx+xml':
|
||||
fix_ncx(item.get('href'), changes)
|
||||
break
|
||||
break
|
||||
|
||||
open(pathtoopf, 'wb').write(opf.render())
|
||||
|
30
src/calibre/ebooks/html/__init__.py
Normal file
30
src/calibre/ebooks/html/__init__.py
Normal file
@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import with_statement
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
from lxml.etree import tostring as _tostring
|
||||
|
||||
def tostring(root, strip_comments=False, pretty_print=False):
    '''
    Serialize processed XHTML.

    Sets the XHTML/xlink namespace declarations on the root (and the SVG
    namespace on any ``svg`` elements), serializes with lxml and prepends
    an XML declaration. Optionally removes all XML comments.
    '''
    # Pin down the document namespaces before serializing.
    root.set('xmlns', 'http://www.w3.org/1999/xhtml')
    root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink')
    for elem in root.iter():
        # Tags may be Clark-notation ('{uri}local'); compare only the local name.
        localname = elem.tag.rpartition('}')[-1]
        if localname.lower() == 'svg':
            elem.set('xmlns', 'http://www.w3.org/2000/svg')

    serialized = _tostring(root, encoding='utf-8', pretty_print=pretty_print)
    if strip_comments:
        serialized = re.compile(r'<!--.*?-->', re.DOTALL).sub('', serialized)

    return '<?xml version="1.0" encoding="utf-8" ?>\n' + serialized
|
||||
|
||||
|
342
src/calibre/ebooks/html/input.py
Normal file
342
src/calibre/ebooks/html/input.py
Normal file
@ -0,0 +1,342 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import with_statement
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
'''
|
||||
Input plugin for HTML or OPF ebooks.
|
||||
'''
|
||||
|
||||
import os, re, sys, cStringIO
|
||||
from urlparse import urlparse, urlunparse
|
||||
from urllib import unquote
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.customize.conversion import OptionRecommendation
|
||||
from calibre import unicode_path
|
||||
|
||||
class Link(object):
    '''
    Represents a link in a HTML file.

    Resolves local (filesystem) link targets to absolute paths and keeps
    the fragment (anchor) separately.
    '''

    @classmethod
    def url_to_local_path(cls, url, base):
        # *url* is a ParseResult; rebuild the path portion (path + params +
        # query) without scheme/netloc/fragment, then unquote %-escapes.
        path = urlunparse(('', '', url.path, url.params, url.query, ''))
        path = unquote(path)
        if os.path.isabs(path):
            return path
        # Relative URLs are resolved against the directory of the referrer.
        return os.path.abspath(os.path.join(base, path))

    def __init__(self, url, base):
        '''
        :param url: The url this link points to. Must be an unquoted unicode string.
        :param base: The base directory that relative URLs are with respect to.
                     Must be a unicode string.
        '''
        assert isinstance(url, unicode) and isinstance(base, unicode)
        self.url = url
        self.parsed_url = urlparse(self.url)
        # Only 'file' or scheme-less URLs can refer to the local filesystem.
        self.is_local = self.parsed_url.scheme in ('', 'file')
        # A local URL with no path (e.g. '#anchor') points inside the same file.
        self.is_internal = self.is_local and not bool(self.parsed_url.path)
        self.path = None  # absolute local path, or None for remote/internal links
        self.fragment = unquote(self.parsed_url.fragment)
        if self.is_local and not self.is_internal:
            self.path = self.url_to_local_path(self.parsed_url, base)

    def __hash__(self):
        # Remote/internal links hash by URL; local ones by resolved path, so
        # different URLs pointing at the same file collapse together.
        if self.path is None:
            return hash(self.url)
        return hash(self.path)

    def __eq__(self, other):
        # Compares equal to other Links with the same path, and also to a
        # plain path string (getattr falls back to *other* itself).
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return u'Link: %s --> %s'%(self.url, self.path)
|
||||
|
||||
|
||||
class IgnoreFile(Exception):
    '''
    Raised while following links to signal that a file should be skipped.
    '''

    def __init__(self, msg, errno):
        Exception.__init__(self, msg)
        self.errno = errno
        # errno 2 == ENOENT: the target file is missing rather than unreadable.
        self.doesnt_exist = (errno == 2)
|
||||
|
||||
class HTMLFile(object):
    '''
    Contains basic information about an HTML file. This
    includes a list of links to other files as well as
    the encoding of each file. Also tries to detect if the file is not a HTML
    file in which case :member:`is_binary` is set to True.

    The encoding of the file is available as :member:`encoding`.
    '''

    # Cheap heuristics applied to raw bytes -- no real HTML parser here.
    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
    # Matches <a href=...> with double-quoted, single-quoted or bare values.
    LINK_PAT = re.compile(
        r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
        re.DOTALL|re.IGNORECASE)

    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
        '''
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML.
        :param referrer: The :class:`HTMLFile` that first refers to this file.
        '''
        self.path = unicode_path(path_to_html_file, abs=True)
        # Default title is the file name; replaced below if a <title> is found.
        self.title = os.path.splitext(os.path.basename(self.path))[0]
        self.base = os.path.dirname(self.path)
        self.level = level
        self.referrer = referrer
        self.links = []  # list of Link objects found in this file

        try:
            with open(self.path, 'rb') as f:
                src = f.read()
        except IOError, err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
            # An unreadable root file is fatal; unreadable linked files are
            # merely skipped via IgnoreFile.
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

        # Only sniff the first 1KB for an <html> tag to classify the file.
        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
        if not self.is_binary:
            if encoding is None:
                # Auto-detect encoding from a 4KB prefix.
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
                self.encoding = encoding
            else:
                self.encoding = encoding
            # NOTE(review): binary files never get a self.encoding attribute;
            # callers appear to filter out is_binary files first -- confirm.

            src = src.decode(encoding, 'replace')
            match = self.TITLE_PAT.search(src)
            self.title = match.group(1) if match is not None else self.title
            self.find_links(src)

    def __eq__(self, other):
        # Equal to other HTMLFiles with the same path, and to bare path
        # strings (getattr falls back to *other* itself).
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)

    def __repr__(self):
        return str(self)

    def find_links(self, src):
        # Scan decoded source for <a href=...> targets; keep each unique
        # link once, in first-seen order.
        for match in self.LINK_PAT.finditer(src):
            url = None
            # Exactly one of the three alternation groups matched.
            for i in ('url1', 'url2', 'url3'):
                url = match.group(i)
                if url:
                    break
            link = self.resolve(url)
            if link not in self.links:
                self.links.append(link)

    def resolve(self, url):
        # Resolve *url* relative to this file's directory.
        return Link(url, self.base)
return Link(url, self.base)
|
||||
|
||||
|
||||
def depth_first(root, flat, visited=None):
    '''
    Yield HTML files reachable from *root* in depth first order.

    :param root:    The :class:`HTMLFile` to start from. It is yielded first.
    :param flat:    Flat list of all known :class:`HTMLFile` objects. Links are
                    resolved against it, so files absent from it (e.g. pruned
                    by ``max_levels``) are silently skipped.
    :param visited: Set of already-yielded files, used by the recursive calls.
                    Leave as ``None`` at the top level.
    '''
    # Fix: the previous version used a mutable default argument
    # (visited=set([])), so the visited set leaked between independent
    # top-level calls and a second traversal yielded only the root.
    if visited is None:
        visited = set()
    yield root
    visited.add(root)
    for link in root.links:
        if link.path is not None and link not in visited:
            try:
                index = flat.index(link)
            except ValueError:  # Can happen if max_levels is used
                continue
            hf = flat[index]
            if hf not in visited:
                yield hf
                visited.add(hf)
                # Recurse, sharing the same visited set to avoid cycles.
                for hf in depth_first(hf, flat, visited):
                    if hf not in visited:
                        yield hf
                        visited.add(hf)
|
||||
|
||||
|
||||
def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
    '''
    Recursively traverse all links in the HTML file.

    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
                       implies that no links in the root HTML file are followed.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    :return: A pair of lists (breadth_first, depth_first). Each list contains
             :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    level = 0
    # ``flat`` accumulates every discovered file in breadth-first order.
    flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
    next_level = list(flat)
    while level < max_levels and len(next_level) > 0:
        level += 1
        nl = []  # files discovered at this level
        for hf in next_level:
            rejects = []
            for link in hf.links:
                # Skip non-local links and files already seen (HTMLFile
                # compares equal to a plain path string).
                if link.path is None or link.path in flat:
                    continue
                try:
                    nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
                    if nf.is_binary:
                        raise IgnoreFile('%s is a binary file'%nf.path, -1)
                    nl.append(nf)
                    flat.append(nf)
                except IgnoreFile, err:
                    # Drop links to unreadable/binary targets from the
                    # referring file as well.
                    rejects.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print repr(err)
            for link in rejects:
                hf.links.remove(link)

        next_level = list(nl)
    # depth_first() recurses once per file; raise the recursion limit to
    # survive pathologically deep link chains, then restore it.
    orec = sys.getrecursionlimit()
    sys.setrecursionlimit(500000)
    try:
        return flat, list(depth_first(flat[0], flat))
    finally:
        sys.setrecursionlimit(orec)
|
||||
|
||||
|
||||
def opf_traverse(opf_reader, verbose=0, encoding=None):
    '''
    Return a list of :class:`HTMLFile` objects in the order specified by the
    `<spine>` element of the OPF.

    :param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance.
    :param encoding: Specify character encoding of HTML files. If `None` it is
                     auto-detected.
    '''
    if not opf_reader.spine:
        raise ValueError('OPF does not have a spine')
    flat = []
    # Spine items first, preserving spine order.
    for path in opf_reader.spine.items():
        path = os.path.abspath(path)
        if path not in flat:
            flat.append(os.path.abspath(path))
    # Then any HTML manifest items not already in the spine.
    for item in opf_reader.manifest:
        if 'html' in item.mime_type:
            path = os.path.abspath(item.path)
            if path not in flat:
                flat.append(path)
    for i, path in enumerate(flat):
        if not os.path.exists(path):
            # Workaround for OPFs where '&' in file names was stored
            # unescaped: retry with '&' -> '%26' and patch the manifest
            # hrefs to match.
            path = path.replace('&', '%26')
            if os.path.exists(path):
                flat[i] = path
                for item in opf_reader.itermanifest():
                    item.set('href', item.get('href').replace('&', '%26'))
    ans = []
    for path in flat:
        if os.path.exists(path):
            ans.append(HTMLFile(path, 0, encoding, verbose))
        else:
            print 'WARNING: OPF spine item %s does not exist'%path
    # Non-HTML (binary) files cannot be part of the reading order.
    ans = [f for f in ans if not f.is_binary]
    return ans
|
||||
|
||||
def search_for_opf(dir):
    '''
    Return an :class:`OPF` built from the first file in *dir* whose name
    ends in ``.opf`` (case-insensitive), or None if no OPF file is found.
    '''
    for entry in os.listdir(dir):
        if not entry.lower().endswith('.opf'):
            continue
        return OPF(open(os.path.join(dir, entry), 'rb'), dir)
|
||||
|
||||
def get_filelist(htmlfile, dir, opts, log):
    '''
    Build list of files referenced by html file or try to detect and use an
    OPF file instead.

    :return: A pair (opf, filelist) where opf may be None and filelist is a
             list of :class:`HTMLFile` objects.
    '''
    print 'Building file list...'
    opf = search_for_opf(dir)
    filelist = None
    if opf is not None:
        try:
            filelist = opf_traverse(opf, verbose=opts.verbose,
                    encoding=opts.input_encoding)
        except:
            # Deliberate best-effort: a broken OPF falls back to link
            # traversal below instead of aborting the conversion.
            pass
    if not filelist:
        # No (usable) OPF: follow links starting from the HTML file itself.
        # traverse() returns (breadth_first, depth_first) lists.
        filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
                verbose=opts.verbose,
                encoding=opts.input_encoding)\
                [0 if opts.breadth_first else 1]
    if opts.verbose:
        log.debug('\tFound files...')
        for f in filelist:
            log.debug('\t\t', f)
    return opf, filelist
|
||||
|
||||
|
||||
class HTMLInput(InputFormatPlugin):
    '''
    Input plugin for HTML or OPF ebooks. Builds a file list (from the OPF
    spine or by following links), writes a metadata.opf/toc.ncx pair into
    the current directory and returns an OEBBook created from it.
    '''

    name        = 'HTML Input'
    author      = 'Kovid Goyal'
    description = 'Convert HTML and OPF files to an OEB'
    file_types  = set(['opf', 'html', 'htm', 'xhtml', 'xhtm'])

    options = set([
        OptionRecommendation(name='breadth_first',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Traverse links in HTML files breadth first. Normally, '
                   'they are traversed depth first.'
                   )
        ),

        OptionRecommendation(name='max_levels',
            recommended_value=5, level=OptionRecommendation.LOW,
            help=_('Maximum levels of recursion when following links in '
                   'HTML files. Must be non-negative. 0 implies that no '
                   'links in the root HTML file are followed. Default is '
                   '%default.'
                   )
        ),

    ])

    def convert(self, stream, opts, file_ext, log,
            accelerators):
        '''
        Convert an HTML or OPF file to an OEBBook.

        All output (metadata.opf, toc.ncx) is written to the current
        working directory, per the InputFormatPlugin contract.
        '''
        # Resolve relative links against the input file's directory when known.
        basedir = os.getcwd()
        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
        if file_ext == 'opf':
            # Input is an OPF: the spine defines the file list directly.
            opf = OPF(stream, basedir)
            filelist = opf_traverse(opf, verbose=opts.verbose,
                    encoding=opts.input_encoding)
            mi = MetaInformation(opf)
        else:
            # Input is HTML: look for a sibling OPF or follow links.
            opf, filelist = get_filelist(stream.name, basedir, opts, log)
            mi = MetaInformation(opf)
            # Merge any metadata embedded in the HTML itself.
            mi.smart_update(get_metadata(stream, 'html'))

        # Build a fresh OPF (manifest + spine) in the current directory.
        mi = OPFCreator(os.getcwdu(), mi)
        mi.guide = None
        entries = [(f.path, 'application/xhtml+xml') for f in filelist]
        mi.create_manifest(entries)
        mi.create_spine([f.path for f in filelist])

        tocbuf = cStringIO.StringIO()
        mi.render(open('metadata.opf', 'wb'), tocbuf, 'toc.ncx')
        toc = tocbuf.getvalue()
        if toc:
            open('toc.ncx', 'wb').write(toc)

        # Local import to avoid a circular dependency with the plumber.
        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, os.path.abspath('metadata.opf'))
|
||||
|
||||
|
||||
|
||||
|
@ -683,26 +683,6 @@ class OPF(object):
|
||||
|
||||
return property(fget=fget, fset=fset)
|
||||
|
||||
@dynamic_property
|
||||
def title_sort(self):
|
||||
|
||||
def fget(self):
|
||||
matches = self.title_path(self.metadata)
|
||||
if matches:
|
||||
for match in matches:
|
||||
ans = match.get('{%s}file-as'%self.NAMESPACES['opf'], None)
|
||||
if not ans:
|
||||
ans = match.get('file-as', None)
|
||||
if ans:
|
||||
return ans
|
||||
|
||||
def fset(self, val):
|
||||
matches = self.title_path(self.metadata)
|
||||
if matches:
|
||||
matches[0].set('file-as', unicode(val))
|
||||
|
||||
return property(fget=fget, fset=fset)
|
||||
|
||||
@dynamic_property
|
||||
def tags(self):
|
||||
|
||||
@ -943,9 +923,10 @@ class OPFCreator(MetaInformation):
|
||||
from calibre.resources import opf_template
|
||||
from calibre.utils.genshi.template import MarkupTemplate
|
||||
template = MarkupTemplate(opf_template)
|
||||
toc = getattr(self, 'toc', None)
|
||||
if self.manifest:
|
||||
self.manifest.set_basedir(self.base_path)
|
||||
if ncx_manifest_entry is not None:
|
||||
if ncx_manifest_entry is not None and toc is not None:
|
||||
if not os.path.isabs(ncx_manifest_entry):
|
||||
ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry)
|
||||
remove = [i for i in self.manifest if i.id == 'ncx']
|
||||
@ -965,7 +946,6 @@ class OPFCreator(MetaInformation):
|
||||
opf = template.generate(__appname__=__appname__, mi=self, __version__=__version__).render('xml')
|
||||
opf_stream.write(opf)
|
||||
opf_stream.flush()
|
||||
toc = getattr(self, 'toc', None)
|
||||
if toc is not None and ncx_stream is not None:
|
||||
toc.render(ncx_stream, self.application_id)
|
||||
ncx_stream.flush()
|
||||
@ -1030,19 +1010,8 @@ class OPFTest(unittest.TestCase):
|
||||
self.opf.smart_update(MetaInformation(self.opf))
|
||||
self.testReading()
|
||||
|
||||
def testCreator(self):
|
||||
opf = OPFCreator(os.getcwd(), self.opf)
|
||||
buf = cStringIO.StringIO()
|
||||
opf.render(buf)
|
||||
raw = buf.getvalue()
|
||||
self.testReading(opf=OPF(cStringIO.StringIO(raw), os.getcwd()))
|
||||
|
||||
def testSmartUpdate(self):
|
||||
self.opf.smart_update(self.opf)
|
||||
self.testReading()
|
||||
|
||||
def suite():
|
||||
return unittest.TestLoader().loadTestsFromTestCase(OPFTest)
|
||||
|
||||
def test():
|
||||
unittest.TextTestRunner(verbosity=2).run(suite())
|
||||
unittest.TextTestRunner(verbosity=2).run(suite())
|
||||
|
@ -29,5 +29,5 @@ class MOBIInput(InputFormatPlugin):
|
||||
with open(f, 'wb') as q:
|
||||
q.write(html.tostring(root, encoding='utf-8', method='xml',
|
||||
include_meta_content_type=False))
|
||||
accelerators['pagebreaks'] = {f: '//div[@class="mbp_pagebreak"]'}
|
||||
accelerators['pagebreaks'] = {f: '//*[@class="mbp_pagebreak"]'}
|
||||
return mr.created_opf_path
|
||||
|
@ -522,7 +522,7 @@ class MobiReader(object):
|
||||
else:
|
||||
raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))
|
||||
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
|
||||
self.mobi_html = self.mobi_html.replace('\r ', '\n\n ')
|
||||
self.mobi_html = self.mobi_html.replace('\r ', '\n\n').replace('\0', '')
|
||||
return processed_records
|
||||
|
||||
|
||||
|
@ -151,7 +151,7 @@ def resolve_base_href(root):
|
||||
return
|
||||
make_links_absolute(root, base_href, resolve_base_href=False)
|
||||
|
||||
def rewrite_links(root, link_repl_func, resolve_base_href=True):
|
||||
def rewrite_links(root, link_repl_func, resolve_base_href=False):
|
||||
'''
|
||||
Rewrite all the links in the document. For each link
|
||||
``link_repl_func(link)`` will be called, and the return value
|
||||
|
@ -6,9 +6,16 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, shutil
|
||||
import os
|
||||
from urllib import unquote as urlunquote
|
||||
from functools import partial
|
||||
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS
|
||||
from lxml import etree
|
||||
import cssutils
|
||||
|
||||
from calibre.constants import islinux
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS, urlnormalize, urldefrag, \
|
||||
rewrite_links
|
||||
|
||||
class Package(object):
|
||||
|
||||
@ -29,18 +36,69 @@ class Package(object):
|
||||
self.new_base_path = os.path.abspath(base)
|
||||
|
||||
def rewrite_links_in(self, item):
|
||||
new_items = []
|
||||
return new_items
|
||||
base = os.path.join(self.new_base_path, *item.href.split('/'))
|
||||
base = os.path.dirname(base)
|
||||
|
||||
if etree.iselement(item.data):
|
||||
self.rewrite_links_in_xml(item.data, base)
|
||||
elif hasattr(item.data, 'cssText'):
|
||||
self.rewrite_links_in_css(item.data, base)
|
||||
|
||||
def link_replacer(self, link_, base=''):
|
||||
link = urlnormalize(link_)
|
||||
link, frag = urldefrag(link)
|
||||
link = urlunquote(link).replace('/', os.sep)
|
||||
if base and not os.path.isabs(link):
|
||||
link = os.path.join(base, link)
|
||||
link = os.path.abspath(link)
|
||||
if not islinux:
|
||||
link = link.lower()
|
||||
if link not in self.map:
|
||||
return link_
|
||||
nlink = os.path.relpath(self.map[link], base)
|
||||
if frag:
|
||||
nlink = '#'.join(nlink, frag)
|
||||
return nlink.replace(os.sep, '/')
|
||||
|
||||
def rewrite_links_in_css(self, sheet, base):
|
||||
repl = partial(self.link_replacer, base=base)
|
||||
cssutils.replaceUrls(sheet, repl)
|
||||
|
||||
def rewrite_links_in_xml(self, root, base):
|
||||
repl = partial(self.link_replacer, base=base)
|
||||
rewrite_links(root, repl)
|
||||
|
||||
def move_manifest_item(self, item):
|
||||
item.data # Make sure the data has been loaded and cached
|
||||
old_abspath = os.path.join(self.old_base_path, *item.href.split('/'))
|
||||
bname = item.href.split('/')[-1]
|
||||
new_href = 'content/' + \
|
||||
('resources/' if item.media_type in OEB_DOCS else '')+bname
|
||||
old_abspath = os.path.join(self.old_base_path,
|
||||
*(urldefrag(item.href)[0].split('/')))
|
||||
old_abspath = os.path.abspath(old_abspath)
|
||||
bname = item.href.split('/')[-1].partition('#')[0]
|
||||
new_href = 'content/resources/'
|
||||
if item.media_type in OEB_DOCS:
|
||||
new_href = 'content/'
|
||||
elif item.href.lower().endswith('.ncx'):
|
||||
new_href = ''
|
||||
new_href += bname
|
||||
|
||||
new_abspath = os.path.join(self.new_base_path, *new_href.split('/'))
|
||||
new_abspath = os.path.abspath(new_abspath)
|
||||
item.href = new_href
|
||||
if not islinux:
|
||||
old_abspath, new_abspath = old_abspath.lower(), new_abspath.lower()
|
||||
if old_abspath != new_abspath:
|
||||
self.map[old_abspath] = new_abspath
|
||||
|
||||
def rewrite_links_in_toc(self, toc):
|
||||
if toc.href:
|
||||
toc.href = self.link_replacer(toc.href, base=self.new_base_path)
|
||||
|
||||
for x in toc:
|
||||
self.rewrite_links_in_toc(x)
|
||||
|
||||
def __call__(self, oeb, context):
|
||||
self.map = {}
|
||||
self.log = self.oeb.log
|
||||
self.old_base_path = os.path.abspath(oeb.container.rootdir)
|
||||
|
||||
for item in self.oeb.manifest:
|
||||
@ -49,4 +107,9 @@ class Package(object):
|
||||
for item in self.oeb.manifest:
|
||||
self.rewrite_links_in(item)
|
||||
|
||||
if getattr(oeb.toc, 'nodes', False):
|
||||
self.rewrite_links_in_toc(oeb.toc)
|
||||
|
||||
if hasattr(oeb, 'guide'):
|
||||
for ref in oeb.guide.values():
|
||||
ref.href = self.link_replacer(ref.href, base=self.new_base_path)
|
||||
|
@ -6,11 +6,12 @@ from __future__ import with_statement
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||
|
||||
from itertools import chain
|
||||
from urlparse import urldefrag
|
||||
|
||||
import cssutils
|
||||
|
||||
from calibre.ebooks.oeb.base import CSS_MIME, OEB_DOCS
|
||||
from calibre.ebooks.oeb.base import LINK_SELECTORS, CSSURL_RE
|
||||
from calibre.ebooks.oeb.base import urlnormalize
|
||||
from calibre.ebooks.oeb.base import urlnormalize, iterlinks
|
||||
|
||||
class ManifestTrimmer(object):
|
||||
@classmethod
|
||||
@ -44,16 +45,15 @@ class ManifestTrimmer(object):
|
||||
if (item.media_type in OEB_DOCS or
|
||||
item.media_type[-4:] in ('/xml', '+xml')) and \
|
||||
item.data is not None:
|
||||
hrefs = [sel(item.data) for sel in LINK_SELECTORS]
|
||||
for href in chain(*hrefs):
|
||||
hrefs = [r[2] for r in iterlinks(item.data)]
|
||||
for href in hrefs:
|
||||
href = item.abshref(urlnormalize(href))
|
||||
if href in oeb.manifest.hrefs:
|
||||
found = oeb.manifest.hrefs[href]
|
||||
if found not in used:
|
||||
new.add(found)
|
||||
elif item.media_type == CSS_MIME:
|
||||
for match in CSSURL_RE.finditer(item.data.cssText):
|
||||
href = match.group('url')
|
||||
for href in cssutils.getUrls(item.data):
|
||||
href = item.abshref(urlnormalize(href))
|
||||
if href in oeb.manifest.hrefs:
|
||||
found = oeb.manifest.hrefs[href]
|
||||
|
@ -22,9 +22,6 @@ entry_points = {
|
||||
'web2disk = calibre.web.fetch.simple:main',
|
||||
'feeds2disk = calibre.web.feeds.main:main',
|
||||
'calibre-server = calibre.library.server:main',
|
||||
'feeds2lrf = calibre.ebooks.lrf.feeds.convert_from:main',
|
||||
'feeds2epub = calibre.ebooks.epub.from_feeds:main',
|
||||
'feeds2mobi = calibre.ebooks.mobi.from_feeds:main',
|
||||
'web2lrf = calibre.ebooks.lrf.web.convert_from:main',
|
||||
'lrf2lrs = calibre.ebooks.lrf.lrfparser:main',
|
||||
'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main',
|
||||
@ -154,10 +151,7 @@ def setup_completion(fatal_errors):
|
||||
from calibre.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop
|
||||
from calibre.web.feeds.main import option_parser as feeds2disk
|
||||
from calibre.web.feeds.recipes import titles as feed_titles
|
||||
from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
|
||||
from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
|
||||
from calibre.ebooks.epub.from_feeds import option_parser as feeds2epub
|
||||
from calibre.ebooks.mobi.from_feeds import option_parser as feeds2mobi
|
||||
from calibre.ebooks.epub.from_comic import option_parser as comic2epub
|
||||
from calibre.ebooks.metadata.fetch import option_parser as fem_op
|
||||
from calibre.gui2.main import option_parser as guiop
|
||||
@ -192,9 +186,6 @@ def setup_completion(fatal_errors):
|
||||
f.write(opts_and_exts('comic2mobi', comic2epub, ['cbz', 'cbr']))
|
||||
f.write(opts_and_exts('comic2pdf', comic2epub, ['cbz', 'cbr']))
|
||||
f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
|
||||
f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
|
||||
f.write(opts_and_words('feeds2epub', feeds2epub, feed_titles))
|
||||
f.write(opts_and_words('feeds2mobi', feeds2mobi, feed_titles))
|
||||
f.write(opts_and_words('fetch-ebook-metadata', fem_op, []))
|
||||
f.write(opts_and_words('calibre-smtp', smtp_op, []))
|
||||
f.write('''
|
||||
|
Loading…
x
Reference in New Issue
Block a user