Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

Commit 8fd446090c: Sync to pluginize
@@ -122,8 +122,9 @@ class InputFormatPlugin(Plugin):

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        This method must be implemented in sub-classes. It must return
-       the path to the created OPF file. All output should be contained in
-       the current directory. If this plugin creates files outside the current
+       the path to the created OPF file or an :class:`OEBBook` instance.
+       All output should be contained in the current directory.
+       If this plugin creates files outside the current
        directory they must be deleted/marked for deletion before this method
        returns.
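For orientation, a minimal input plugin under the pluginized API might look like the sketch below. The class name, file type, and OPF path are illustrative assumptions; only the convert() signature and the path-or-OEBBook return contract come from the hunk above.

    from calibre.customize.conversion import InputFormatPlugin

    class ExampleInput(InputFormatPlugin):  # hypothetical plugin
        name = 'Example Input'
        file_types = set(['txt'])

        def convert(self, stream, options, file_ext, log, accelerators):
            # All output must be written to the current directory.
            log.info('Converting %s...' % file_ext)
            opfpath = 'metadata.opf'  # produced by the plugin's own logic
            return opfpath            # or return an OEBBook instance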
@@ -299,21 +299,15 @@ OptionRecommendation(name='language',

        # Create an OEBBook from the input file. The input plugin does all the
        # heavy lifting.
-       from calibre.ebooks.oeb.reader import OEBReader
-       from calibre.ebooks.oeb.base import OEBBook
        accelerators = {}

        tdir = PersistentTemporaryDirectory('_plumber')

-       opfpath = self.input_plugin(open(self.input, 'rb'), self.opts,
+       self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts,
                                    self.input_fmt, self.log,
                                    accelerators, tdir)
-       html_preprocessor = HTMLPreProcessor()
-       self.reader = OEBReader()
-       self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor)
-       # Read OEB Book into OEBBook
-       self.log.info('Parsing all content...')
-       self.reader(self.oeb, opfpath)
+       if not hasattr(self.oeb, 'manifest'):
+           self.oeb = create_oebbook(self.log, self.oeb)

        self.opts.source = self.opts.input_profile
        self.opts.dest = self.opts.output_profile

@@ -340,7 +334,20 @@ OptionRecommendation(name='language',

        trimmer(self.oeb, self.opts)

        self.log.info('Creating %s...'%self.output_plugin.name)
-       self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts,
-                                  self.log)
+       self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
+                                  self.opts, self.log)
+
+def create_oebbook(log, opfpath):
+    '''
+    Create an OEBBook from an OPF file.
+    '''
+    from calibre.ebooks.oeb.reader import OEBReader
+    from calibre.ebooks.oeb.base import OEBBook
+    html_preprocessor = HTMLPreProcessor()
+    reader = OEBReader()
+    oeb = OEBBook(log, html_preprocessor=html_preprocessor)
+    # Read OEB Book into OEBBook
+    log.info('Parsing all content...')
+    reader(oeb, opfpath)
+    return oeb
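Because an input plugin may now hand back either an OPF path or a ready OEBBook, callers can normalize the result with the same duck-typing check the plumber uses above; a minimal sketch:

    def ensure_oebbook(log, result):
        # An OEBBook already carries a manifest; a path string does not.
        if not hasattr(result, 'manifest'):
            result = create_oebbook(log, result)
        return result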
@@ -10,23 +10,23 @@ import sys, textwrap, re, os, uuid
from itertools import cycle

from calibre.utils.config import Config, StringConfig
from calibre.utils.zipfile import ZipFile, ZIP_STORED
-from calibre.ebooks.html import config as common_config, tostring
+from calibre.ebooks.html import tostring
from lxml import etree

class DefaultProfile(object):

    flow_size = sys.maxint
    screen_size = None
    remove_special_chars = False
    remove_object_tags = False


class PRS505(DefaultProfile):

    flow_size = 270000
    screen_size = (590, 765)
    remove_special_chars = re.compile(u'[\u200b\u00ad]')
    remove_object_tags = True


PROFILES = {
            'PRS505' : PRS505,

@@ -64,11 +64,11 @@ def config(defaults=None, name='epub'):
        c = Config(name, desc)
    else:
        c = StringConfig(defaults, desc)

    c.update(common_config())
    c.remove_opt('output')
    c.remove_opt('zip')

    c.add_opt('output', ['-o', '--output'], default=None,
        help=_('The output EPUB file. If not specified, it is '
               'derived from the input file name.'))

@@ -81,22 +81,22 @@ def config(defaults=None, name='epub'):
        help=_('Either the path to a CSS stylesheet or raw CSS. '
               'This CSS will override any existing CSS '
               'declarations in the source files.'))
    structure = c.add_group('structure detection',
        _('Control auto-detection of document structure.'))
    structure('chapter', ['--chapter'],
        default="//*[re:match(name(), 'h[1-2]') and "
                "re:test(., 'chapter|book|section|part', 'i')] | "
                "//*[@class = 'chapter']",
        help=_('''\
An XPath expression to detect chapter titles. The default is to consider <h1> or
<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles as
well as any tags that have class="chapter".
The expression used must evaluate to a list of elements. To disable chapter detection,
use the expression "/". See the XPath Tutorial in the calibre User Manual for further
help on using this feature.
''').replace('\n', ' '))
    structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both', 'none'],
        default='pagebreak',
        help=_('Specify how to mark detected chapters. A value of '
               '"pagebreak" will insert page breaks before chapters. '
               'A value of "rule" will insert a line before chapters. '

@@ -129,13 +129,13 @@ help on using this feature.
        help=_('XPath expression to find the name of each page in the '
               'pagination map relative to its boundary element. '
               'Default is to number all pages staring with 1.'))
    toc = c.add_group('toc',
        _('''\
Control the automatic generation of a Table of Contents. If an OPF file is detected
and it specifies a Table of Contents, then that will be used rather than trying
to auto-generate a Table of Contents.
''').replace('\n', ' '))
    toc('max_toc_links', ['--max-toc-links'], default=50,
        help=_('Maximum number of links to insert into the TOC. Set to 0 '
               'to disable. Default is: %default. Links are only added to the '
               'TOC if less than the --toc-threshold number of chapters were detected.'))

@@ -166,15 +166,15 @@ to auto-generate a Table of Contents.
        help=_('Normally, if the source file already has a Table of Contents, '
               'it is used in preference to the auto-generated one. '
               'With this option, the auto-generated one is always used.'))

    layout = c.add_group('page layout', _('Control page layout'))
    layout('margin_top', ['--margin-top'], default=5.0,
        help=_('Set the top margin in pts. Default is %default'))
    layout('margin_bottom', ['--margin-bottom'], default=5.0,
        help=_('Set the bottom margin in pts. Default is %default'))
    layout('margin_left', ['--margin-left'], default=5.0,
        help=_('Set the left margin in pts. Default is %default'))
    layout('margin_right', ['--margin-right'], default=5.0,
        help=_('Set the right margin in pts. Default is %default'))
    layout('base_font_size2', ['--base-font-size'], default=12.0,
        help=_('The base font size in pts. Default is %defaultpt. '

@@ -195,12 +195,12 @@ to auto-generate a Table of Contents.
               'This is only neccessary if the HTML files contain CSS that '
               'uses sibling selectors. Enabling this greatly slows down '
               'processing of large HTML files.'))

    c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
        help=_('Print generated OPF file to stdout'))
    c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',
        help=_('Print generated NCX file to stdout'))
    c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug',
        default=False,
        help=_('Keep intermediate files during processing by html2epub'))
    c.add_opt('extract_to', ['--extract-to'], group='debug', default=None,
@@ -14,7 +14,7 @@ from lxml.cssselect import CSSSelector
from lxml import etree
from lxml.html import HtmlElement

-from calibre.ebooks.html import fromstring
+from calibre.ebooks.html_old import fromstring
from calibre.ebooks.epub import rules
from cssutils import CSSParser

@@ -24,7 +24,7 @@ absolute_size = r'(?P<abs>(x?x-)?(small|large)|medium)'
relative_size = r'(?P<rel>smaller|larger)'

font_size_pat = re.compile('|'.join((relative_size, absolute_size, length)), re.I)
line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num))

PTU = {
       'in' : 72.,

@@ -37,12 +37,12 @@ PTU = {
DEFAULT_FONT_SIZE = 12

class Rationalizer(object):

    @classmethod
    def specificity(cls, s):
        '''Map CSS specificity tuple to a single integer'''
        return sum([10**(4-i) + x for i,x in enumerate(s)])

    @classmethod
    def compute_font_size(cls, elem):
        '''
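As a sanity check on specificity(), the expression folds a four-element CSS specificity tuple into one comparable integer, weighting earlier positions by powers of ten; rerun standalone:

    def specificity(s):
        # (inline style, #ids, .classes/attributes, elements) -> integer
        return sum([10**(4-i) + x for i, x in enumerate(s)])

    print specificity((0, 1, 2, 3))  # 10000 + 1001 + 102 + 13 = 11116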
@@ -59,7 +59,7 @@ class Rationalizer(object):
                elem.computed_font_size = sfs(parent.computed_font_size)
            else:
                elem.computed_font_size = sfs

    @classmethod
    def calculate_font_size(cls, style):
        'Return font size in pts from style object. For relative units returns a callable'

@@ -69,7 +69,7 @@ class Rationalizer(object):
            fs = match.group()
        if style.fontSize:
            fs = style.fontSize

        match = font_size_pat.search(fs)
        if match is None:
            return None

@@ -89,8 +89,8 @@ class Rationalizer(object):
            return 12 * x
        if match.get('zero', False):
            return 0.
        return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8)

    @classmethod
    def resolve_rules(cls, stylesheets):
        for sheet in stylesheets:
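For the relative keywords, calculate_font_size returns a callable rather than a number, so compute_font_size can apply it to the parent's computed size; the contract in isolation:

    import functools, operator

    larger = functools.partial(operator.mul, 1.2)   # 'larger'
    smaller = functools.partial(operator.mul, 0.8)  # 'smaller'
    print larger(10.0)   # 12.0
    print smaller(10.0)  # 8.0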
@@ -104,12 +104,12 @@ class Rationalizer(object):
                if font_size is not None:
                    for s in r.selectorList:
                        sheet.fs_rules.append([CSSSelector(s.selectorText), font_size])
                orig = line_height_pat.search(r.style.lineHeight)
                if orig is not None:
                    for s in r.selectorList:
                        sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]])

    @classmethod
    def apply_font_size_rules(cls, stylesheets, root):
        'Add a ``specified_font_size`` attribute to every element that has a specified font size'

@@ -119,7 +119,7 @@ class Rationalizer(object):
            elems = selector(root)
            for elem in elems:
                elem.specified_font_size = font_size

    @classmethod
    def remove_font_size_information(cls, stylesheets):
        for r in rules(stylesheets):

@@ -134,17 +134,17 @@ class Rationalizer(object):
                    r.style.removeProperty('font')
            if line_height_pat.search(r.style.lineHeight) is not None:
                r.style.removeProperty('line-height')

    @classmethod
    def compute_font_sizes(cls, root, stylesheets, base=12):
        stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')]
        cls.apply_font_size_rules(stylesheets, root)

        # Compute the effective font size of all tags
        root.computed_font_size = DEFAULT_FONT_SIZE
        for elem in root.iter(etree.Element):
            cls.compute_font_size(elem)

        extra_css = {}
        if base > 0:
            # Calculate the "base" (i.e. most common) font size

@@ -157,20 +157,20 @@ class Rationalizer(object):
                if t: t = t.strip()
                if t:
                    font_sizes[elem.computed_font_size] += len(t)

                t = getattr(elem, 'tail', '')
                if t: t = t.strip()
                if t:
                    parent = elem.getparent()
                    if parent.tag not in IGNORE:
                        font_sizes[parent.computed_font_size] += len(t)

            try:
                most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0]
                scale = base/most_common if most_common > 0 else 1.
            except ValueError:
                scale = 1.

            # rescale absolute line-heights
            counter = 0
            for sheet in stylesheets:

@@ -181,17 +181,17 @@ class Rationalizer(object):
                        if not extra_css.has_key(elem.get('id')):
                            extra_css[elem.get('id')] = []
                        extra_css[elem.get('id')].append('line-height:%fpt'%(lh*scale))

            # Rescale all computed font sizes
            for elem in body.iter(etree.Element):
                if isinstance(elem, HtmlElement):
                    elem.computed_font_size *= scale

            # Remove all font size specifications from the last stylesheet
            cls.remove_font_size_information(stylesheets[-1:])

            # Create the CSS to implement the rescaled font sizes
            for elem in body.iter(etree.Element):
                cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent()))

@@ -201,12 +201,12 @@ class Rationalizer(object):
                if not extra_css.has_key(elem.get('id')):
                    extra_css[elem.get('id')] = []
                extra_css[elem.get('id')].append('font-size: %f%%'%(100*(cfs/pcfs)))

        css = CSSParser(loglevel=logging.ERROR).parseString('')
        for id, r in extra_css.items():
            css.add('#%s {%s}'%(id, ';'.join(r)))
        return css

    @classmethod
    def rationalize(cls, stylesheets, root, opts):
        logger = logging.getLogger('html2epub')

@@ -229,7 +229,7 @@ class Rationalizer(object):
################################################################################

class FontTest(unittest.TestCase):

    def setUp(self):
        from calibre.ebooks.epub import config
        self.opts = config(defaults='').parse()

@@ -246,10 +246,10 @@ class FontTest(unittest.TestCase):
        <p id="p2">Some other <span class="it">text</span>.</p>
        <p id="longest">The longest piece of single font size text in this entire file. Used to test resizing.</p>
        </body>
        </html>
        '''
        self.root = fromstring(self.html)

    def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1):
        root1 = copy.deepcopy(self.root)
        root1.computed_font_size = DEFAULT_FONT_SIZE

@@ -262,39 +262,39 @@ class FontTest(unittest.TestCase):
        for elem in root2.iter(etree.Element):
            Rationalizer.compute_font_size(elem)
        for e1, e2 in zip(root1.xpath('//body')[0].iter(etree.Element), root2.xpath('//body')[0].iter(etree.Element)):
            self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size,
                msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\
                    (root1.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size))
        return stylesheet2.cssText

    def testStripping(self):
        'Test that any original entries are removed from the CSS'
        css = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }'
        css = CSSParser(loglevel=logging.ERROR).parseString(css)
        Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [css])
        self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''),
                         'p{font:bolditalic}')

    def testIdentity(self):
        'Test that no unnecessary font size changes are made'
        extra_css = self.do_test('div {font-size:12pt} \nspan {font-size:100%}')
        self.assertEqual(extra_css.strip(), '')

    def testRelativization(self):
        'Test conversion of absolute to relative sizes'
        self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}')

    def testResizing(self):
        'Test resizing of fonts'
        self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}')


def suite():
    return unittest.TestLoader().loadTestsFromTestCase(FontTest)

def test():
    unittest.TextTestRunner(verbosity=2).run(suite())

if __name__ == '__main__':
    sys.exit(test())
@@ -38,7 +38,7 @@ from lxml.etree import XPath
from lxml import html, etree
from PyQt4.Qt import QApplication, QPixmap

-from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\
+from calibre.ebooks.html_old import Processor, merge_metadata, get_filelist,\
    opf_traverse, create_metadata, rebase_toc, Link, parser
from calibre.ebooks.epub import config as common_config, tostring
from calibre.ptempfile import TemporaryDirectory

@@ -16,7 +16,7 @@ from calibre.ebooks.epub import config
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.chardet import xml_to_unicode
-from calibre.ebooks.html import create_dir
+from calibre.ebooks.html_old import create_dir
from calibre.utils.zipfile import safe_replace, ZipFile
from calibre.utils.config import DynamicConfig
@@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
Split the flows in an epub file to conform to size limitations.
'''

-import os, math, logging, functools, collections, re, copy, sys
+import os, math, functools, collections, re, copy, sys

from lxml.etree import XPath as _XPath
from lxml import etree, html

@@ -24,16 +24,16 @@ SPLIT_ATTR = 'cs'
SPLIT_POINT_ATTR = 'csp'

class SplitError(ValueError):

    def __init__(self, path, root):
        size = len(tostring(root))/1024.
        ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')%
                            (os.path.basename(path), size))


class Splitter(object):

    def __init__(self, path, opts, stylesheet_map, opf):
        self.setup_cli_handler(opts.verbose)
        self.path = path

@@ -44,10 +44,10 @@ class Splitter(object):
        self.orig_size = os.stat(content(path)).st_size
        self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
        root = html.fromstring(open(content(path)).read())

        self.page_breaks, self.trees = [], []
        self.split_size = 0

        # Split on page breaks
        self.splitting_on_page_breaks = True
        if not opts.dont_split_on_page_breaks:

@@ -59,7 +59,7 @@ class Splitter(object):
        else:
            self.trees = [root.getroottree()]
        trees = list(self.trees)

        # Split any remaining over-sized trees
        self.splitting_on_page_breaks = False
        if self.opts.profile.flow_size < sys.maxint:

@@ -67,7 +67,7 @@ class Splitter(object):
            self.log_info('\tLooking for large trees...')
            for i, tree in enumerate(list(trees)):
                self.trees = []
                size = len(tostring(tree.getroot()))
                if size > self.opts.profile.flow_size:
                    lt_found = True
                    try:

@@ -81,7 +81,7 @@ class Splitter(object):
                        trees[i:i+1] = list(self.trees)
            if not lt_found:
                self.log_info('\tNo large trees found')

        self.trees = trees
        self.was_split = len(self.trees) > 1
        if self.was_split:

@@ -91,17 +91,17 @@ class Splitter(object):
            for f in self.files:
                self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
        self.fix_opf(opf)

        self.trees = None


    def split_text(self, text, root, size):
        self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
        rest = text.replace('\r', '')
        parts = re.split('\n\n', rest)
        self.log_debug('\t\t\t\tFound %d parts'%len(parts))
        if max(map(len, parts)) > size:
            raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
        ans = []
        buf = ''
        for part in parts:

@@ -111,8 +111,8 @@ class Splitter(object):
                ans.append(buf)
                buf = part
        return ans


    def split_to_size(self, tree):
        self.log_debug('\t\tSplitting...')
        root = tree.getroot()
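split_text packs blank-line-separated paragraphs into buffers of at most size characters. Since the middle of the method is elided by this hunk, here is a self-contained rendering of the idea (the final-buffer flush is an assumption):

    import re

    def split_text(text, size):
        # Split on blank lines, then greedily pack paragraphs into
        # chunks no larger than `size` characters.
        parts = re.split('\n\n', text.replace('\r', ''))
        ans, buf = [], ''
        for part in parts:
            if len(buf) + len(part) < size:
                buf = (buf + '\n\n' + part) if buf else part
            else:
                ans.append(buf)
                buf = part
        if buf:
            ans.append(buf)
        return ans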
@@ -134,7 +134,7 @@ class Splitter(object):
            p = pre.getparent()
            i = p.index(pre)
            p[i:i+1] = new_pres

        split_point, before = self.find_split_point(root)
        if split_point is None or self.split_size > 6*self.orig_size:
            if not self.always_remove:

@@ -142,7 +142,7 @@ class Splitter(object):
                    'structure preservation. This may cause '
                    'incorrect rendering.'))
            raise SplitError(self.path, root)

        for t in self.do_split(tree, split_point, before):
            r = t.getroot()
            if self.is_page_empty(r):

@@ -151,12 +151,12 @@ class Splitter(object):
            if size <= self.opts.profile.flow_size:
                self.trees.append(t)
                #print tostring(t.getroot(), pretty_print=True)
                self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
                               len(self.trees), size/1024.)
                self.split_size += size
            else:
                self.split_to_size(t)

    def is_page_empty(self, root):
        body = root.find('body')
        if body is None:

@@ -170,14 +170,14 @@ class Splitter(object):
            if img.get('style', '') != 'display:none':
                return False
        return True

    def do_split(self, tree, split_point, before):
        '''
        Split ``tree`` into a *before* and *after* tree at ``split_point``,
        preserving tag structure, but not duplicating any text.
        All tags that have had their text and tail
        removed have the attribute ``calibre_split`` set to 1.

        :param before: If True tree is split before split_point, otherwise after split_point
        :return: before_tree, after_tree
        '''
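To visualize what do_split produces, this toy sketch (an illustration, not calibre's implementation) splits two copies of a small tree at an <hr>: each copy empties out the elements on the other side of the split point, so tag structure, and therefore CSS scope, survives in both halves:

    import copy
    from lxml import etree

    root = etree.fromstring('<body><p>one</p><hr/><p>two</p></body>')
    t1, t2 = copy.deepcopy(root), copy.deepcopy(root)

    hit = False
    for elem in list(t1.iterdescendants()):
        if elem.tag == 'hr':
            hit = True
        elif hit:  # after the split point: keep the tag, drop the text
            elem.text, elem.tail = '', ''
            elem.set('calibre_split', '1')

    hit = False
    for elem in list(t2.iterdescendants()):
        if elem.tag == 'hr':
            hit = True
        elif not hit:  # before the split point: keep the tag, drop the text
            elem.text, elem.tail = '', ''
            elem.set('calibre_split', '1')

    print etree.tostring(t1)  # "one" survives, "two" is emptied
    print etree.tostring(t2)  # "one" is emptied, "two" survives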
@@ -188,7 +188,7 @@ class Splitter(object):
        body, body2 = root.body, root2.body
        split_point = root.xpath(path)[0]
        split_point2 = root2.xpath(path)[0]

        def nix_element(elem, top=True):
            if self.always_remove:
                parent = elem.getparent()

@@ -198,18 +198,18 @@ class Splitter(object):
                else:
                    index = parent.index(elem)
                    parent[index:index+1] = list(elem.iterchildren())

            else:
                elem.text = u''
                elem.tail = u''
                elem.set(SPLIT_ATTR, '1')
                if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']:
                    elem.set('style', 'display:none')

        def fix_split_point(sp):
            if not self.splitting_on_page_breaks:
                sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')

        # Tree 1
        hit_split_point = False
        for elem in list(body.iterdescendants(etree.Element)):

@@ -223,8 +223,8 @@ class Splitter(object):
                continue
            if hit_split_point:
                nix_element(elem)

        # Tree 2
        hit_split_point = False
        for elem in list(body2.iterdescendants(etree.Element)):

@@ -238,17 +238,17 @@ class Splitter(object):
                continue
            if not hit_split_point:
                nix_element(elem, top=False)

        return tree, tree2


    def split_on_page_breaks(self, orig_tree):
        ordered_ids = []
        for elem in orig_tree.xpath('//*[@id]'):
            id = elem.get('id')
            if id in self.page_break_ids:
                ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])

        self.trees = []
        tree = orig_tree
        for pattern, before in ordered_ids:

@@ -260,13 +260,13 @@ class Splitter(object):
            tree = after
        self.trees.append(tree)
        self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]


    def find_page_breaks(self, stylesheets, root):
        '''
        Find all elements that have either page-break-before or page-break-after set.
        Populates `self.page_breaks` with id based XPath selectors (for elements that don't
        have ids, an id is created).
        '''
        page_break_selectors = set([])
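find_page_breaks works by compiling each stylesheet rule's selector text with lxml's CSSSelector and running it against the document root; the mechanism in miniature (the markup is made up):

    from lxml import html
    from lxml.cssselect import CSSSelector

    root = html.fromstring(
        '<body><h1 class="chapter">One</h1><p>text</p>'
        '<h1 class="chapter">Two</h1></body>')
    sel = CSSSelector('h1.chapter')  # selectorText from a page-break rule
    print [e.text for e in sel(root)]  # ['One', 'Two']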
@@ -283,16 +283,16 @@ class Splitter(object):
                    page_break_selectors.add((CSSSelector(rule.selectorText), False))
            except:
                pass

        page_breaks = set([])
        for selector, before in page_break_selectors:
            for elem in selector(root):
                elem.pb_before = before
                page_breaks.add(elem)

        for i, elem in enumerate(root.iter()):
            elem.pb_order = i

        page_breaks = list(page_breaks)
        page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
        self.page_break_ids = []

@@ -300,12 +300,12 @@ class Splitter(object):
            x.set('id', x.get('id', 'calibre_pb_%d'%i))
            id = x.get('id')
            self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
            self.page_break_ids.append(id)


    def find_split_point(self, root):
        '''
        Find the tag at which to split the tree rooted at `root`.
        Search order is:
            * Heading tags
            * <div> tags

@@ -314,7 +314,7 @@ class Splitter(object):
            * <p> tags
            * <br> tags
            * <li> tags

        We try to split in the "middle" of the file (as defined by tag counts.
        '''
        def pick_elem(elems):

@@ -325,18 +325,18 @@ class Splitter(object):
                i = int(math.floor(len(elems)/2.))
                elems[i].set(SPLIT_POINT_ATTR, '1')
                return elems[i]

        for path in (
                     '//*[re:match(name(), "h[1-6]", "i")]',
                     '/html/body/div',
                     '//pre',
                     '//hr',
                     '//p',
                     '//div',
                     '//br',
                     '//li',
                    ):
            elems = root.xpath(path,
                    namespaces={'re':'http://exslt.org/regular-expressions'})
            elem = pick_elem(elems)
            if elem is not None:

@@ -345,9 +345,9 @@ class Splitter(object):
                except:
                    continue
                return elem, True

        return None, True

    def commit(self):
        '''
        Commit all changes caused by the split. This removes the previously

@@ -357,7 +357,7 @@ class Splitter(object):
        '''
        self.anchor_map = collections.defaultdict(lambda :self.base%0)
        self.files = []

        for i, tree in enumerate(self.trees):
            root = tree.getroot()
            self.files.append(self.base%i)

@@ -367,7 +367,7 @@ class Splitter(object):
            for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)):
                elem.attrib.pop(SPLIT_ATTR, None)
                elem.attrib.pop(SPLIT_POINT_ATTR, '0')

        for current, tree in zip(self.files, self.trees):
            for a in tree.getroot().xpath('//a[@href]'):
                href = a.get('href').strip()

@@ -375,10 +375,10 @@ class Splitter(object):
                    anchor = href[1:]
                    file = self.anchor_map[anchor]
                    if file != current:
                        a.set('href', file+href)
            open(content(current), 'wb').\
                write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))

        os.remove(content(self.path))

@@ -391,12 +391,12 @@ class Splitter(object):
        id_map = {}
        for item in items:
            id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)

        for id in id_map.keys():
            opf.replace_spine_items_by_idref(id, id_map[id])

        for ref in opf.iterguide():
            href = ref.get('href', '')
            if href.startswith('content/'+self.path):
                href = href.split('#')
                frag = None

@@ -408,8 +408,8 @@ class Splitter(object):
                new_file = self.anchor_map[frag]
                ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))



def fix_content_links(html_files, changes, opts):
    split_files = [f.path for f in changes]
    anchor_maps = [f.anchor_map for f in changes]

@@ -420,7 +420,7 @@ def fix_content_links(html_files, changes, opts):
            files[i:i+1] = changes[j].files
        except ValueError:
            continue

    for htmlfile in files:
        changed = False
        root = html.fromstring(open(content(htmlfile), 'rb').read())

@@ -439,7 +439,7 @@ def fix_content_links(html_files, changes, opts):
                frag = ('#'+anchor) if anchor else ''
                a.set('href', newf+frag)
                changed = True

        if changed:
            open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))

@@ -448,7 +448,7 @@ def fix_ncx(path, changes):
    anchor_maps = [f.anchor_map for f in changes]
    tree = etree.parse(path)
    changed = False
    for content in tree.getroot().xpath('//x:content[@src]',
            namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
        href = content.get('src')
        if not href.startswith('#'):

@@ -481,21 +481,21 @@ def find_html_files(opf):
        if os.path.exists(content(f)):
            html_files.append(f)
    return html_files


def split(pathtoopf, opts, stylesheet_map):
    pathtoopf = os.path.abspath(pathtoopf)
    opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))

    with CurrentDir(os.path.dirname(pathtoopf)):
        html_files = find_html_files(opf)
        changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files]
        changes = [c for c in changes if c.was_split]

        fix_content_links(html_files, changes, opts)
        for item in opf.itermanifest():
            if item.get('media-type', '') == 'application/x-dtbncx+xml':
                fix_ncx(item.get('href'), changes)
                break

        open(pathtoopf, 'wb').write(opf.render())
src/calibre/ebooks/html/__init__.py (new file, 30 lines)
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__ = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from lxml.etree import tostring as _tostring
+
+def tostring(root, strip_comments=False, pretty_print=False):
+    '''
+    Serialize processed XHTML.
+    '''
+    root.set('xmlns', 'http://www.w3.org/1999/xhtml')
+    root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink')
+    for x in root.iter():
+        if x.tag.rpartition('}')[-1].lower() == 'svg':
+            x.set('xmlns', 'http://www.w3.org/2000/svg')
+
+    ans = _tostring(root, encoding='utf-8', pretty_print=pretty_print)
+    if strip_comments:
+        ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
+    ans = '<?xml version="1.0" encoding="utf-8" ?>\n'+ans
+
+    return ans
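The comment stripping in tostring() is a plain regex pass over the serialized bytes; in isolation it behaves like this:

    import re

    xml = '<p>keep<!-- drop\nme --></p>'
    # re.DOTALL lets the pattern span the newline inside the comment.
    print re.compile(r'<!--.*?-->', re.DOTALL).sub('', xml)  # <p>keep</p>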
src/calibre/ebooks/html/input.py (new file, 342 lines)
@@ -0,0 +1,342 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__ = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Input plugin for HTML or OPF ebooks.
+'''
+
+import os, re, sys, cStringIO
+from urlparse import urlparse, urlunparse
+from urllib import unquote
+
+from calibre.customize.conversion import InputFormatPlugin
+from calibre.ebooks.metadata.meta import get_metadata
+from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
+from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.customize.conversion import OptionRecommendation
+from calibre import unicode_path
+
+class Link(object):
+    '''
+    Represents a link in a HTML file.
+    '''
+
+    @classmethod
+    def url_to_local_path(cls, url, base):
+        path = urlunparse(('', '', url.path, url.params, url.query, ''))
+        path = unquote(path)
+        if os.path.isabs(path):
+            return path
+        return os.path.abspath(os.path.join(base, path))
+
+    def __init__(self, url, base):
+        '''
+        :param url: The url this link points to. Must be an unquoted unicode string.
+        :param base: The base directory that relative URLs are with respect to.
+                     Must be a unicode string.
+        '''
+        assert isinstance(url, unicode) and isinstance(base, unicode)
+        self.url = url
+        self.parsed_url = urlparse(self.url)
+        self.is_local = self.parsed_url.scheme in ('', 'file')
+        self.is_internal = self.is_local and not bool(self.parsed_url.path)
+        self.path = None
+        self.fragment = unquote(self.parsed_url.fragment)
+        if self.is_local and not self.is_internal:
+            self.path = self.url_to_local_path(self.parsed_url, base)
+
+    def __hash__(self):
+        if self.path is None:
+            return hash(self.url)
+        return hash(self.path)
+
+    def __eq__(self, other):
+        return self.path == getattr(other, 'path', other)
+
+    def __str__(self):
+        return u'Link: %s --> %s'%(self.url, self.path)
+
+
+class IgnoreFile(Exception):
+
+    def __init__(self, msg, errno):
+        Exception.__init__(self, msg)
+        self.doesnt_exist = errno == 2
+        self.errno = errno
+
+class HTMLFile(object):
+    '''
+    Contains basic information about an HTML file. This
+    includes a list of links to other files as well as
+    the encoding of each file. Also tries to detect if the file is not a HTML
+    file in which case :member:`is_binary` is set to True.
+
+    The encoding of the file is available as :member:`encoding`.
+    '''
+
+    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
+    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
+    LINK_PAT = re.compile(
+        r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
+        re.DOTALL|re.IGNORECASE)
+
+    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
+        '''
+        :param level: The level of this file. Should be 0 for the root file.
+        :param encoding: Use `encoding` to decode HTML.
+        :param referrer: The :class:`HTMLFile` that first refers to this file.
+        '''
+        self.path = unicode_path(path_to_html_file, abs=True)
+        self.title = os.path.splitext(os.path.basename(self.path))[0]
+        self.base = os.path.dirname(self.path)
+        self.level = level
+        self.referrer = referrer
+        self.links = []
+
+        try:
+            with open(self.path, 'rb') as f:
+                src = f.read()
+        except IOError, err:
+            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
+            if level == 0:
+                raise IOError(msg)
+            raise IgnoreFile(msg, err.errno)
+
+        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
+        if not self.is_binary:
+            if encoding is None:
+                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
+                self.encoding = encoding
+            else:
+                self.encoding = encoding
+
+            src = src.decode(encoding, 'replace')
+            match = self.TITLE_PAT.search(src)
+            self.title = match.group(1) if match is not None else self.title
+            self.find_links(src)
+
+    def __eq__(self, other):
+        return self.path == getattr(other, 'path', other)
+
+    def __str__(self):
+        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
+
+    def __repr__(self):
+        return str(self)
+
+    def find_links(self, src):
+        for match in self.LINK_PAT.finditer(src):
+            url = None
+            for i in ('url1', 'url2', 'url3'):
+                url = match.group(i)
+                if url:
+                    break
+            link = self.resolve(url)
+            if link not in self.links:
+                self.links.append(link)
+
+    def resolve(self, url):
+        return Link(url, self.base)
+
+
+def depth_first(root, flat, visited=set([])):
+    yield root
+    visited.add(root)
+    for link in root.links:
+        if link.path is not None and link not in visited:
+            try:
+                index = flat.index(link)
+            except ValueError: # Can happen if max_levels is used
+                continue
+            hf = flat[index]
+            if hf not in visited:
+                yield hf
+                visited.add(hf)
+                for hf in depth_first(hf, flat, visited):
+                    if hf not in visited:
+                        yield hf
+                        visited.add(hf)
+
+
+def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
+    '''
+    Recursively traverse all links in the HTML file.
+
+    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
+                       implies that no links in the root HTML file are followed.
+    :param encoding: Specify character encoding of HTML files. If `None` it is
+                     auto-detected.
+    :return: A pair of lists (breadth_first, depth_first). Each list contains
+             :class:`HTMLFile` objects.
+    '''
+    assert max_levels >= 0
+    level = 0
+    flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
+    next_level = list(flat)
+    while level < max_levels and len(next_level) > 0:
+        level += 1
+        nl = []
+        for hf in next_level:
+            rejects = []
+            for link in hf.links:
+                if link.path is None or link.path in flat:
+                    continue
+                try:
+                    nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
+                    if nf.is_binary:
+                        raise IgnoreFile('%s is a binary file'%nf.path, -1)
+                    nl.append(nf)
+                    flat.append(nf)
+                except IgnoreFile, err:
+                    rejects.append(link)
+                    if not err.doesnt_exist or verbose > 1:
+                        print repr(err)
+            for link in rejects:
+                hf.links.remove(link)
+
+        next_level = list(nl)
+    orec = sys.getrecursionlimit()
+    sys.setrecursionlimit(500000)
+    try:
+        return flat, list(depth_first(flat[0], flat))
+    finally:
+        sys.setrecursionlimit(orec)
+
+
+def opf_traverse(opf_reader, verbose=0, encoding=None):
+    '''
+    Return a list of :class:`HTMLFile` objects in the order specified by the
+    `<spine>` element of the OPF.
+
+    :param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance.
+    :param encoding: Specify character encoding of HTML files. If `None` it is
+                     auto-detected.
+    '''
+    if not opf_reader.spine:
+        raise ValueError('OPF does not have a spine')
+    flat = []
+    for path in opf_reader.spine.items():
+        path = os.path.abspath(path)
+        if path not in flat:
+            flat.append(os.path.abspath(path))
+    for item in opf_reader.manifest:
+        if 'html' in item.mime_type:
+            path = os.path.abspath(item.path)
+            if path not in flat:
+                flat.append(path)
+    for i, path in enumerate(flat):
+        if not os.path.exists(path):
+            path = path.replace('&', '%26')
+            if os.path.exists(path):
+                flat[i] = path
+                for item in opf_reader.itermanifest():
+                    item.set('href', item.get('href').replace('&', '%26'))
+    ans = []
+    for path in flat:
+        if os.path.exists(path):
+            ans.append(HTMLFile(path, 0, encoding, verbose))
+        else:
+            print 'WARNING: OPF spine item %s does not exist'%path
+    ans = [f for f in ans if not f.is_binary]
+    return ans
+
+def search_for_opf(dir):
+    for f in os.listdir(dir):
+        if f.lower().endswith('.opf'):
+            return OPF(open(os.path.join(dir, f), 'rb'), dir)
+
+def get_filelist(htmlfile, dir, opts, log):
+    '''
+    Build list of files referenced by html file or try to detect and use an
+    OPF file instead.
+    '''
+    print 'Building file list...'
+    opf = search_for_opf(dir)
+    filelist = None
+    if opf is not None:
+        try:
+            filelist = opf_traverse(opf, verbose=opts.verbose,
+                                    encoding=opts.input_encoding)
+        except:
+            pass
+    if not filelist:
+        filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
+                            verbose=opts.verbose,
+                            encoding=opts.input_encoding)\
+                   [0 if opts.breadth_first else 1]
+    if opts.verbose:
+        log.debug('\tFound files...')
+        for f in filelist:
+            log.debug('\t\t', f)
+    return opf, filelist
+
+
+class HTMLInput(InputFormatPlugin):
+
+    name = 'HTML Input'
+    author = 'Kovid Goyal'
+    description = 'Convert HTML and OPF files to an OEB'
+    file_types = set(['opf', 'html', 'htm', 'xhtml', 'xhtm'])
+
+    options = set([
+        OptionRecommendation(name='breadth_first',
+            recommended_value=False, level=OptionRecommendation.LOW,
+            help=_('Traverse links in HTML files breadth first. Normally, '
+                   'they are traversed depth first.'
+                   )
+        ),
+
+        OptionRecommendation(name='max_levels',
+            recommended_value=5, level=OptionRecommendation.LOW,
+            help=_('Maximum levels of recursion when following links in '
+                   'HTML files. Must be non-negative. 0 implies that no '
+                   'links in the root HTML file are followed. Default is '
+                   '%default.'
+                   )
+        ),
+
+    ])
+
+    def convert(self, stream, opts, file_ext, log,
+                accelerators):
+        basedir = os.getcwd()
+        if hasattr(stream, 'name'):
+            basedir = os.path.dirname(stream.name)
+        if file_ext == 'opf':
+            opf = OPF(stream, basedir)
+            filelist = opf_traverse(opf, verbose=opts.verbose,
+                                    encoding=opts.input_encoding)
+            mi = MetaInformation(opf)
+        else:
+            opf, filelist = get_filelist(stream.name, basedir, opts, log)
+            mi = MetaInformation(opf)
+            mi.smart_update(get_metadata(stream, 'html'))
+
+        mi = OPFCreator(os.getcwdu(), mi)
+        mi.guide = None
+        entries = [(f.path, 'application/xhtml+xml') for f in filelist]
+        mi.create_manifest(entries)
+        mi.create_spine([f.path for f in filelist])
+
+        tocbuf = cStringIO.StringIO()
+        mi.render(open('metadata.opf', 'wb'), tocbuf, 'toc.ncx')
+        toc = tocbuf.getvalue()
+        if toc:
+            open('toc.ncx', 'wb').write(toc)
+
+        from calibre.ebooks.conversion.plumber import create_oebbook
+        return create_oebbook(log, os.path.abspath('metadata.opf'))
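Assuming a root index.html that links to other local files, the new traversal API could be exercised like this (the path is illustrative):

    from calibre.ebooks.html.input import traverse

    flat, depth = traverse(u'index.html', max_levels=1, verbose=1)
    for hf in depth:  # depth-first ordering of the discovered files
        print hf.level, hf.encoding, hf.path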
@@ -683,26 +683,6 @@ class OPF(object):

        return property(fget=fget, fset=fset)

-    @dynamic_property
-    def title_sort(self):
-
-        def fget(self):
-            matches = self.title_path(self.metadata)
-            if matches:
-                for match in matches:
-                    ans = match.get('{%s}file-as'%self.NAMESPACES['opf'], None)
-                    if not ans:
-                        ans = match.get('file-as', None)
-                    if ans:
-                        return ans
-
-        def fset(self, val):
-            matches = self.title_path(self.metadata)
-            if matches:
-                matches[0].set('file-as', unicode(val))
-
-        return property(fget=fget, fset=fset)
-
    @dynamic_property
    def tags(self):
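The removed title_sort follows the @dynamic_property idiom used throughout calibre's metadata code: the decorated function builds a property from nested fget/fset closures and returns it. A minimal equivalent of the decorator, shown only as an illustration (calibre's actual definition lives elsewhere in the tree):

    def dynamic_property(func):
        # Calling the function once (self is unused) yields the
        # property object that becomes the class attribute.
        return func(None)

    class Book(object):
        _title = 'x'

        @dynamic_property
        def title(self):
            def fget(self):
                return self._title
            def fset(self, val):
                self._title = val
            return property(fget=fget, fset=fset)

    b = Book()
    b.title = 'Y'
    print b.title  # Y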
@@ -943,9 +923,10 @@ class OPFCreator(MetaInformation):
        from calibre.resources import opf_template
        from calibre.utils.genshi.template import MarkupTemplate
        template = MarkupTemplate(opf_template)
+       toc = getattr(self, 'toc', None)
        if self.manifest:
            self.manifest.set_basedir(self.base_path)
-           if ncx_manifest_entry is not None:
+           if ncx_manifest_entry is not None and toc is not None:
                if not os.path.isabs(ncx_manifest_entry):
                    ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry)
                remove = [i for i in self.manifest if i.id == 'ncx']

@@ -965,7 +946,6 @@ class OPFCreator(MetaInformation):
        opf = template.generate(__appname__=__appname__, mi=self, __version__=__version__).render('xml')
        opf_stream.write(opf)
        opf_stream.flush()
-       toc = getattr(self, 'toc', None)
        if toc is not None and ncx_stream is not None:
            toc.render(ncx_stream, self.application_id)
            ncx_stream.flush()

@@ -1030,19 +1010,8 @@ class OPFTest(unittest.TestCase):
        self.opf.smart_update(MetaInformation(self.opf))
        self.testReading()

-    def testCreator(self):
-        opf = OPFCreator(os.getcwd(), self.opf)
-        buf = cStringIO.StringIO()
-        opf.render(buf)
-        raw = buf.getvalue()
-        self.testReading(opf=OPF(cStringIO.StringIO(raw), os.getcwd()))
-
-    def testSmartUpdate(self):
-        self.opf.smart_update(self.opf)
-        self.testReading()
-
def suite():
    return unittest.TestLoader().loadTestsFromTestCase(OPFTest)

def test():
    unittest.TextTestRunner(verbosity=2).run(suite())
@@ -29,5 +29,5 @@ class MOBIInput(InputFormatPlugin):
            with open(f, 'wb') as q:
                q.write(html.tostring(root, encoding='utf-8', method='xml',
                    include_meta_content_type=False))
-           accelerators['pagebreaks'] = {f: '//div[@class="mbp_pagebreak"]'}
+           accelerators['pagebreaks'] = {f: '//*[@class="mbp_pagebreak"]'}
        return mr.created_opf_path
@@ -522,7 +522,7 @@ class MobiReader(object):
        else:
            raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))
        if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
-           self.mobi_html = self.mobi_html.replace('\r ', '\n\n ')
+           self.mobi_html = self.mobi_html.replace('\r ', '\n\n').replace('\0', '')
        return processed_records
@@ -151,7 +151,7 @@ def resolve_base_href(root):
        return
    make_links_absolute(root, base_href, resolve_base_href=False)

-def rewrite_links(root, link_repl_func, resolve_base_href=True):
+def rewrite_links(root, link_repl_func, resolve_base_href=False):
    '''
    Rewrite all the links in the document. For each link
    ``link_repl_func(link)`` will be called, and the return value
@@ -6,9 +6,16 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

-import os, shutil
+import os
+from urllib import unquote as urlunquote
+from functools import partial

-from calibre.ebooks.oeb.base import OEB_DOCS
+from lxml import etree
+import cssutils
+
+from calibre.constants import islinux
+from calibre.ebooks.oeb.base import OEB_DOCS, urlnormalize, urldefrag, \
+    rewrite_links

class Package(object):

@@ -29,18 +36,69 @@ class Package(object):
        self.new_base_path = os.path.abspath(base)

    def rewrite_links_in(self, item):
-        new_items = []
-        return new_items
+        base = os.path.join(self.new_base_path, *item.href.split('/'))
+        base = os.path.dirname(base)
+
+        if etree.iselement(item.data):
+            self.rewrite_links_in_xml(item.data, base)
+        elif hasattr(item.data, 'cssText'):
+            self.rewrite_links_in_css(item.data, base)
+
+    def link_replacer(self, link_, base=''):
+        link = urlnormalize(link_)
+        link, frag = urldefrag(link)
+        link = urlunquote(link).replace('/', os.sep)
+        if base and not os.path.isabs(link):
+            link = os.path.join(base, link)
+        link = os.path.abspath(link)
+        if not islinux:
+            link = link.lower()
+        if link not in self.map:
+            return link_
+        nlink = os.path.relpath(self.map[link], base)
+        if frag:
+            nlink = '#'.join((nlink, frag))
+        return nlink.replace(os.sep, '/')
+
+    def rewrite_links_in_css(self, sheet, base):
+        repl = partial(self.link_replacer, base=base)
+        cssutils.replaceUrls(sheet, repl)
+
+    def rewrite_links_in_xml(self, root, base):
+        repl = partial(self.link_replacer, base=base)
+        rewrite_links(root, repl)

    def move_manifest_item(self, item):
        item.data # Make sure the data has been loaded and cached
-        old_abspath = os.path.join(self.old_base_path, *item.href.split('/'))
-        bname = item.href.split('/')[-1]
-        new_href = 'content/' + \
-            ('resources/' if item.media_type in OEB_DOCS else '')+bname
+        old_abspath = os.path.join(self.old_base_path,
+                *(urldefrag(item.href)[0].split('/')))
+        old_abspath = os.path.abspath(old_abspath)
+        bname = item.href.split('/')[-1].partition('#')[0]
+        new_href = 'content/resources/'
+        if item.media_type in OEB_DOCS:
+            new_href = 'content/'
+        elif item.href.lower().endswith('.ncx'):
+            new_href = ''
+        new_href += bname

        new_abspath = os.path.join(self.new_base_path, *new_href.split('/'))
+        new_abspath = os.path.abspath(new_abspath)
        item.href = new_href
+        if not islinux:
+            old_abspath, new_abspath = old_abspath.lower(), new_abspath.lower()
        if old_abspath != new_abspath:
            self.map[old_abspath] = new_abspath

+    def rewrite_links_in_toc(self, toc):
+        if toc.href:
+            toc.href = self.link_replacer(toc.href, base=self.new_base_path)
+
+        for x in toc:
+            self.rewrite_links_in_toc(x)

    def __call__(self, oeb, context):
        self.map = {}
+        self.log = self.oeb.log
        self.old_base_path = os.path.abspath(oeb.container.rootdir)

        for item in self.oeb.manifest:

@@ -49,4 +107,9 @@ class Package(object):
        for item in self.oeb.manifest:
            self.rewrite_links_in(item)

+        if getattr(oeb.toc, 'nodes', False):
+            self.rewrite_links_in_toc(oeb.toc)
+
+        if hasattr(oeb, 'guide'):
+            for ref in oeb.guide.values():
+                ref.href = self.link_replacer(ref.href, base=self.new_base_path)
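The core of link_replacer is: normalize, split off the fragment, resolve against the item's base directory, look the absolute path up in the old-to-new map, then re-relativize. A self-contained sketch of just that remapping, with case-folding and URL quoting omitted:

    import os
    from urlparse import urldefrag

    def relocate(link, mapping, base):
        link, frag = urldefrag(link)
        abspath = os.path.abspath(os.path.join(base, link.replace('/', os.sep)))
        if abspath not in mapping:  # not a moved resource: leave untouched
            return link + (('#' + frag) if frag else '')
        nlink = os.path.relpath(mapping[abspath], base).replace(os.sep, '/')
        return nlink + (('#' + frag) if frag else '')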
@@ -6,11 +6,12 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

from itertools import chain
from urlparse import urldefrag

+import cssutils
+
from calibre.ebooks.oeb.base import CSS_MIME, OEB_DOCS
-from calibre.ebooks.oeb.base import LINK_SELECTORS, CSSURL_RE
-from calibre.ebooks.oeb.base import urlnormalize
+from calibre.ebooks.oeb.base import urlnormalize, iterlinks

class ManifestTrimmer(object):
    @classmethod

@@ -44,16 +45,15 @@ class ManifestTrimmer(object):
            if (item.media_type in OEB_DOCS or
                item.media_type[-4:] in ('/xml', '+xml')) and \
               item.data is not None:
-               hrefs = [sel(item.data) for sel in LINK_SELECTORS]
-               for href in chain(*hrefs):
+               hrefs = [r[2] for r in iterlinks(item.data)]
+               for href in hrefs:
                    href = item.abshref(urlnormalize(href))
                    if href in oeb.manifest.hrefs:
                        found = oeb.manifest.hrefs[href]
                        if found not in used:
                            new.add(found)
            elif item.media_type == CSS_MIME:
-               for match in CSSURL_RE.finditer(item.data.cssText):
-                   href = match.group('url')
+               for href in cssutils.getUrls(item.data):
                    href = item.abshref(urlnormalize(href))
                    if href in oeb.manifest.hrefs:
                        found = oeb.manifest.hrefs[href]
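Replacing the hand-rolled CSSURL_RE scan with cssutils.getUrls delegates URL extraction to the CSS parser, which also picks up @import targets; for example:

    import cssutils

    sheet = cssutils.parseString(
        '@import "extra.css"; body { background: url(images/bg.png) }')
    print list(cssutils.getUrls(sheet))  # ['extra.css', 'images/bg.png']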
@@ -22,9 +22,6 @@ entry_points = {
        'web2disk = calibre.web.fetch.simple:main',
        'feeds2disk = calibre.web.feeds.main:main',
        'calibre-server = calibre.library.server:main',
-       'feeds2lrf = calibre.ebooks.lrf.feeds.convert_from:main',
-       'feeds2epub = calibre.ebooks.epub.from_feeds:main',
-       'feeds2mobi = calibre.ebooks.mobi.from_feeds:main',
        'web2lrf = calibre.ebooks.lrf.web.convert_from:main',
        'lrf2lrs = calibre.ebooks.lrf.lrfparser:main',
        'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main',

@@ -154,10 +151,7 @@ def setup_completion(fatal_errors):
    from calibre.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop
    from calibre.web.feeds.main import option_parser as feeds2disk
    from calibre.web.feeds.recipes import titles as feed_titles
-   from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
    from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
-   from calibre.ebooks.epub.from_feeds import option_parser as feeds2epub
-   from calibre.ebooks.mobi.from_feeds import option_parser as feeds2mobi
    from calibre.ebooks.epub.from_comic import option_parser as comic2epub
    from calibre.ebooks.metadata.fetch import option_parser as fem_op
    from calibre.gui2.main import option_parser as guiop

@@ -192,9 +186,6 @@ def setup_completion(fatal_errors):
    f.write(opts_and_exts('comic2mobi', comic2epub, ['cbz', 'cbr']))
    f.write(opts_and_exts('comic2pdf', comic2epub, ['cbz', 'cbr']))
    f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
-   f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
-   f.write(opts_and_words('feeds2epub', feeds2epub, feed_titles))
-   f.write(opts_and_words('feeds2mobi', feeds2mobi, feed_titles))
    f.write(opts_and_words('fetch-ebook-metadata', fem_op, []))
    f.write(opts_and_words('calibre-smtp', smtp_op, []))
    f.write('''