Untested implementation of HTML input. Uses a new transform that 'packages' an OEB book into a folder structure (the same folder structure that was used in the old codebase for EPUB output). This may have broken other things, so use with care.

Kovid Goyal 2009-04-08 17:44:29 -07:00
parent b2bfab32cf
commit 093b98a9f1
17 changed files with 609 additions and 206 deletions

View File

@@ -122,8 +122,9 @@ class InputFormatPlugin(Plugin):
     def convert(self, stream, options, file_ext, log, accelerators):
         '''
         This method must be implemented in sub-classes. It must return
-        the path to the created OPF file. All output should be contained in
-        the current directory. If this plugin creates files outside the current
+        the path to the created OPF file or an :class:`OEBBook` instance.
+        All output should be contained in the current directory.
+        If this plugin creates files outside the current
         directory they must be deleted/marked for deletion before this method
         returns.
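To illustrate the contract spelled out in this docstring, a minimal subclass might look like the sketch below. This is a hypothetical example, not part of the commit: the format name and the write_opf helper are invented; only the return contract (an OPF path or an OEBBook) comes from the docstring above.

import os
from calibre.customize.conversion import InputFormatPlugin

class ExampleInput(InputFormatPlugin):

    name       = 'Example Input'
    file_types = set(['example'])

    def convert(self, stream, options, file_ext, log, accelerators):
        # Write all generated files into the current directory...
        open('index.html', 'wb').write(
            '<html><body><pre>%s</pre></body></html>' % stream.read())
        # ...generate an OPF alongside them (details omitted)...
        write_opf('metadata.opf', spine=['index.html'])  # hypothetical helper
        # ...and return either the OPF path or an OEBBook instance.
        return os.path.abspath('metadata.opf')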

View File

@@ -299,21 +299,15 @@ OptionRecommendation(name='language',
         # Create an OEBBook from the input file. The input plugin does all the
         # heavy lifting.
-        from calibre.ebooks.oeb.reader import OEBReader
-        from calibre.ebooks.oeb.base import OEBBook
         accelerators = {}
         tdir = PersistentTemporaryDirectory('_plumber')
-        opfpath = self.input_plugin(open(self.input, 'rb'), self.opts,
-                                    self.input_fmt, self.log,
-                                    accelerators, tdir)
-        html_preprocessor = HTMLPreProcessor()
-        self.reader = OEBReader()
-        self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor)
-        # Read OEB Book into OEBBook
-        self.log.info('Parsing all content...')
-        self.reader(self.oeb, opfpath)
+        self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts,
+                                     self.input_fmt, self.log,
+                                     accelerators, tdir)
+        if not hasattr(self.oeb, 'manifest'):
+            self.oeb = create_oebbook(self.log, self.oeb)

         self.opts.source = self.opts.input_profile
         self.opts.dest = self.opts.output_profile

@@ -340,7 +334,20 @@ OptionRecommendation(name='language',
         trimmer(self.oeb, self.opts)

         self.log.info('Creating %s...'%self.output_plugin.name)
-        self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts,
-                self.log)
+        self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
+                self.opts, self.log)
+
+def create_oebbook(log, opfpath):
+    '''
+    Create an OEBBook from an OPF file.
+    '''
+    from calibre.ebooks.oeb.reader import OEBReader
+    from calibre.ebooks.oeb.base import OEBBook
+    html_preprocessor = HTMLPreProcessor()
+    reader = OEBReader()
+    oeb = OEBBook(log, html_preprocessor=html_preprocessor)
+    # Read OEB Book into OEBBook
+    log.info('Parsing all content...')
+    reader(oeb, opfpath)
+    return oeb
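The point of this helper is that the pipeline can now treat both plugin return styles uniformly. Roughly (a sketch, assuming input_plugin, stream and opts are already set up as in the surrounding code, and that log is calibre's Log class):

from calibre.utils.logging import Log
from calibre.ebooks.conversion.plumber import create_oebbook

log = Log()
result = input_plugin.convert(stream, opts, 'html', log, {})
if not hasattr(result, 'manifest'):
    # The plugin returned an OPF path; parse it into an OEBBook.
    result = create_oebbook(log, result)
# From here on, `result` is always an OEBBook.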

View File

@@ -10,23 +10,23 @@ import sys, textwrap, re, os, uuid
 from itertools import cycle

 from calibre.utils.config import Config, StringConfig
 from calibre.utils.zipfile import ZipFile, ZIP_STORED
-from calibre.ebooks.html import config as common_config, tostring
+from calibre.ebooks.html import tostring
 from lxml import etree

 class DefaultProfile(object):

     flow_size = sys.maxint
     screen_size = None
     remove_special_chars = False
     remove_object_tags = False

 class PRS505(DefaultProfile):

     flow_size = 270000
     screen_size = (590, 765)
     remove_special_chars = re.compile(u'[\u200b\u00ad]')
     remove_object_tags = True

 PROFILES = {
     'PRS505' : PRS505,

@@ -64,11 +64,11 @@ def config(defaults=None, name='epub'):
         c = Config(name, desc)
     else:
         c = StringConfig(defaults, desc)
     c.update(common_config())
     c.remove_opt('output')
     c.remove_opt('zip')

     c.add_opt('output', ['-o', '--output'], default=None,
         help=_('The output EPUB file. If not specified, it is '
                'derived from the input file name.'))

@@ -81,22 +81,22 @@ def config(defaults=None, name='epub'):
        help=_('Either the path to a CSS stylesheet or raw CSS. '
               'This CSS will override any existing CSS '
               'declarations in the source files.'))

     structure = c.add_group('structure detection',
             _('Control auto-detection of document structure.'))
     structure('chapter', ['--chapter'],
               default="//*[re:match(name(), 'h[1-2]') and "
                       "re:test(., 'chapter|book|section|part', 'i')] | "
                       "//*[@class = 'chapter']",
               help=_('''\
An XPath expression to detect chapter titles. The default is to consider <h1> or
<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles as
well as any tags that have class="chapter".
The expression used must evaluate to a list of elements. To disable chapter detection,
use the expression "/". See the XPath Tutorial in the calibre User Manual for further
help on using this feature.
''').replace('\n', ' '))
     structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both', 'none'],
               default='pagebreak',
               help=_('Specify how to mark detected chapters. A value of '
                      '"pagebreak" will insert page breaks before chapters. '
                      'A value of "rule" will insert a line before chapters. '

@@ -129,13 +129,13 @@ help on using this feature.
               help=_('XPath expression to find the name of each page in the '
                      'pagination map relative to its boundary element. '
                      'Default is to number all pages staring with 1.'))

     toc = c.add_group('toc',
         _('''\
Control the automatic generation of a Table of Contents. If an OPF file is detected
and it specifies a Table of Contents, then that will be used rather than trying
to auto-generate a Table of Contents.
''').replace('\n', ' '))
     toc('max_toc_links', ['--max-toc-links'], default=50,
         help=_('Maximum number of links to insert into the TOC. Set to 0 '
                'to disable. Default is: %default. Links are only added to the '
                'TOC if less than the --toc-threshold number of chapters were detected.'))

@@ -166,15 +166,15 @@ to auto-generate a Table of Contents.
        help=_('Normally, if the source file already has a Table of Contents, '
               'it is used in preference to the auto-generated one. '
               'With this option, the auto-generated one is always used.'))

     layout = c.add_group('page layout', _('Control page layout'))
     layout('margin_top', ['--margin-top'], default=5.0,
            help=_('Set the top margin in pts. Default is %default'))
     layout('margin_bottom', ['--margin-bottom'], default=5.0,
            help=_('Set the bottom margin in pts. Default is %default'))
     layout('margin_left', ['--margin-left'], default=5.0,
            help=_('Set the left margin in pts. Default is %default'))
     layout('margin_right', ['--margin-right'], default=5.0,
            help=_('Set the right margin in pts. Default is %default'))
     layout('base_font_size2', ['--base-font-size'], default=12.0,
            help=_('The base font size in pts. Default is %defaultpt. '

@@ -195,12 +195,12 @@ to auto-generate a Table of Contents.
               'This is only neccessary if the HTML files contain CSS that '
               'uses sibling selectors. Enabling this greatly slows down '
               'processing of large HTML files.'))

     c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
               help=_('Print generated OPF file to stdout'))
     c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',
               help=_('Print generated NCX file to stdout'))
     c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug',
               default=False,
               help=_('Keep intermediate files during processing by html2epub'))
     c.add_opt('extract_to', ['--extract-to'], group='debug', default=None,
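These grouped options are consumed by parsing the config object, as the font test fixture further down does. A minimal sketch (the option names and defaults are exactly those declared above):

from calibre.ebooks.epub import config

# Parse the epub conversion defaults; each option is an attribute
# on the resulting namespace.
opts = config(defaults='').parse()
print(opts.chapter_mark)   # 'pagebreak'
print(opts.margin_top)     # 5.0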

View File

@@ -14,7 +14,7 @@ from lxml.cssselect import CSSSelector
 from lxml import etree
 from lxml.html import HtmlElement

-from calibre.ebooks.html import fromstring
+from calibre.ebooks.html_old import fromstring
 from calibre.ebooks.epub import rules
 from cssutils import CSSParser

@@ -24,7 +24,7 @@ absolute_size = r'(?P<abs>(x?x-)?(small|large)|medium)'
 relative_size = r'(?P<rel>smaller|larger)'

 font_size_pat = re.compile('|'.join((relative_size, absolute_size, length)), re.I)
 line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num))

 PTU = {
     'in' : 72.,

@@ -37,12 +37,12 @@ PTU = {
 DEFAULT_FONT_SIZE = 12

 class Rationalizer(object):

     @classmethod
     def specificity(cls, s):
         '''Map CSS specificity tuple to a single integer'''
         return sum([10**(4-i) + x for i,x in enumerate(s)])

     @classmethod
     def compute_font_size(cls, elem):
         '''

@@ -59,7 +59,7 @@ class Rationalizer(object):
                 elem.computed_font_size = sfs(parent.computed_font_size)
             else:
                 elem.computed_font_size = sfs

     @classmethod
     def calculate_font_size(cls, style):
         'Return font size in pts from style object. For relative units returns a callable'

@@ -69,7 +69,7 @@ class Rationalizer(object):
             fs = match.group()
         if style.fontSize:
             fs = style.fontSize

         match = font_size_pat.search(fs)
         if match is None:
             return None

@@ -89,8 +89,8 @@ class Rationalizer(object):
                 return 12 * x
         if match.get('zero', False):
             return 0.
         return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8)

     @classmethod
     def resolve_rules(cls, stylesheets):
         for sheet in stylesheets:

@@ -104,12 +104,12 @@ class Rationalizer(object):
                 if font_size is not None:
                     for s in r.selectorList:
                         sheet.fs_rules.append([CSSSelector(s.selectorText), font_size])

                 orig = line_height_pat.search(r.style.lineHeight)
                 if orig is not None:
                     for s in r.selectorList:
                         sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]])

     @classmethod
     def apply_font_size_rules(cls, stylesheets, root):
         'Add a ``specified_font_size`` attribute to every element that has a specified font size'

@@ -119,7 +119,7 @@ class Rationalizer(object):
             elems = selector(root)
             for elem in elems:
                 elem.specified_font_size = font_size

     @classmethod
     def remove_font_size_information(cls, stylesheets):
         for r in rules(stylesheets):

@@ -134,17 +134,17 @@ class Rationalizer(object):
                     r.style.removeProperty('font')
             if line_height_pat.search(r.style.lineHeight) is not None:
                 r.style.removeProperty('line-height')

     @classmethod
     def compute_font_sizes(cls, root, stylesheets, base=12):
         stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')]
         cls.apply_font_size_rules(stylesheets, root)

         # Compute the effective font size of all tags
         root.computed_font_size = DEFAULT_FONT_SIZE
         for elem in root.iter(etree.Element):
             cls.compute_font_size(elem)

         extra_css = {}
         if base > 0:
             # Calculate the "base" (i.e. most common) font size

@@ -157,20 +157,20 @@ class Rationalizer(object):
                 if t: t = t.strip()
                 if t:
                     font_sizes[elem.computed_font_size] += len(t)

                 t = getattr(elem, 'tail', '')
                 if t: t = t.strip()
                 if t:
                     parent = elem.getparent()
                     if parent.tag not in IGNORE:
                         font_sizes[parent.computed_font_size] += len(t)

             try:
                 most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0]
                 scale = base/most_common if most_common > 0 else 1.
             except ValueError:
                 scale = 1.

             # rescale absolute line-heights
             counter = 0
             for sheet in stylesheets:

@@ -181,17 +181,17 @@ class Rationalizer(object):
                         if not extra_css.has_key(elem.get('id')):
                             extra_css[elem.get('id')] = []
                         extra_css[elem.get('id')].append('line-height:%fpt'%(lh*scale))

             # Rescale all computed font sizes
             for elem in body.iter(etree.Element):
                 if isinstance(elem, HtmlElement):
                     elem.computed_font_size *= scale

             # Remove all font size specifications from the last stylesheet
             cls.remove_font_size_information(stylesheets[-1:])

             # Create the CSS to implement the rescaled font sizes
             for elem in body.iter(etree.Element):
                 cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent()))

@@ -201,12 +201,12 @@ class Rationalizer(object):
                     if not extra_css.has_key(elem.get('id')):
                         extra_css[elem.get('id')] = []
                     extra_css[elem.get('id')].append('font-size: %f%%'%(100*(cfs/pcfs)))

         css = CSSParser(loglevel=logging.ERROR).parseString('')
         for id, r in extra_css.items():
             css.add('#%s {%s}'%(id, ';'.join(r)))
         return css

     @classmethod
     def rationalize(cls, stylesheets, root, opts):
         logger = logging.getLogger('html2epub')

@@ -229,7 +229,7 @@ class Rationalizer(object):
 ################################################################################

 class FontTest(unittest.TestCase):

     def setUp(self):
         from calibre.ebooks.epub import config
         self.opts = config(defaults='').parse()

@@ -246,10 +246,10 @@ class FontTest(unittest.TestCase):
                 <p id="p2">Some other <span class="it">text</span>.</p>
                 <p id="longest">The longest piece of single font size text in this entire file. Used to test resizing.</p>
             </body>
             </html>
             '''
         self.root = fromstring(self.html)

     def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1):
         root1 = copy.deepcopy(self.root)
         root1.computed_font_size = DEFAULT_FONT_SIZE

@@ -262,39 +262,39 @@ class FontTest(unittest.TestCase):
         for elem in root2.iter(etree.Element):
             Rationalizer.compute_font_size(elem)
         for e1, e2 in zip(root1.xpath('//body')[0].iter(etree.Element), root2.xpath('//body')[0].iter(etree.Element)):
             self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size,
                 msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\
                 (root1.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size))
         return stylesheet2.cssText

     def testStripping(self):
         'Test that any original entries are removed from the CSS'
         css = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }'
         css = CSSParser(loglevel=logging.ERROR).parseString(css)
         Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [css])
         self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''),
                          'p{font:bolditalic}')

     def testIdentity(self):
         'Test that no unnecessary font size changes are made'
         extra_css = self.do_test('div {font-size:12pt} \nspan {font-size:100%}')
         self.assertEqual(extra_css.strip(), '')

     def testRelativization(self):
         'Test conversion of absolute to relative sizes'
         self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}')

     def testResizing(self):
         'Test resizing of fonts'
         self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}')


 def suite():
     return unittest.TestLoader().loadTestsFromTestCase(FontTest)

 def test():
     unittest.TextTestRunner(verbosity=2).run(suite())

 if __name__ == '__main__':
     sys.exit(test())
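To make the unit handling in this file concrete, here is a standalone sketch of converting CSS lengths to points using a points-per-unit table like PTU above. Only the 'in': 72. entry is visible in the excerpt; the other factors shown here are standard CSS conversions (px assumes 96 dpi) and are an assumption, not copied from the file.

import re

# Points per unit, mirroring the PTU table above.
PTU = {'in': 72., 'pt': 1., 'pc': 12.,
       'mm': 72./25.4, 'cm': 72./2.54, 'px': 72./96.}

def to_pts(spec):
    # Parse '<number><unit>' and convert to points.
    m = re.match(r'([0-9.]+)\s*(in|pt|pc|mm|cm|px)', spec)
    return float(m.group(1)) * PTU[m.group(2)]

print(to_pts('10px'))  # 7.5
print(to_pts('1in'))   # 72.0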

View File

@@ -38,7 +38,7 @@ from lxml.etree import XPath
 from lxml import html, etree
 from PyQt4.Qt import QApplication, QPixmap

-from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\
+from calibre.ebooks.html_old import Processor, merge_metadata, get_filelist,\
     opf_traverse, create_metadata, rebase_toc, Link, parser
 from calibre.ebooks.epub import config as common_config, tostring
 from calibre.ptempfile import TemporaryDirectory

View File

@@ -16,7 +16,7 @@ from calibre.ebooks.epub import config
 from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ptempfile import TemporaryDirectory
 from calibre.ebooks.chardet import xml_to_unicode
-from calibre.ebooks.html import create_dir
+from calibre.ebooks.html_old import create_dir
 from calibre.utils.zipfile import safe_replace, ZipFile
 from calibre.utils.config import DynamicConfig

View File

@@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
 Split the flows in an epub file to conform to size limitations.
 '''

-import os, math, logging, functools, collections, re, copy, sys
+import os, math, functools, collections, re, copy, sys

 from lxml.etree import XPath as _XPath
 from lxml import etree, html

@@ -24,16 +24,16 @@ SPLIT_ATTR = 'cs'
 SPLIT_POINT_ATTR = 'csp'

 class SplitError(ValueError):

     def __init__(self, path, root):
         size = len(tostring(root))/1024.
         ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')%
                             (os.path.basename(path), size))

 class Splitter(object):

     def __init__(self, path, opts, stylesheet_map, opf):
         self.setup_cli_handler(opts.verbose)
         self.path = path

@@ -44,10 +44,10 @@ class Splitter(object):
         self.orig_size = os.stat(content(path)).st_size
         self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
         root = html.fromstring(open(content(path)).read())

         self.page_breaks, self.trees = [], []
         self.split_size = 0

         # Split on page breaks
         self.splitting_on_page_breaks = True
         if not opts.dont_split_on_page_breaks:

@@ -59,7 +59,7 @@ class Splitter(object):
         else:
             self.trees = [root.getroottree()]
             trees = list(self.trees)

         # Split any remaining over-sized trees
         self.splitting_on_page_breaks = False
         if self.opts.profile.flow_size < sys.maxint:

@@ -67,7 +67,7 @@ class Splitter(object):
             self.log_info('\tLooking for large trees...')
             for i, tree in enumerate(list(trees)):
                 self.trees = []
                 size = len(tostring(tree.getroot()))
                 if size > self.opts.profile.flow_size:
                     lt_found = True
                     try:

@@ -81,7 +81,7 @@ class Splitter(object):
                     trees[i:i+1] = list(self.trees)
             if not lt_found:
                 self.log_info('\tNo large trees found')

         self.trees = trees
         self.was_split = len(self.trees) > 1
         if self.was_split:

@@ -91,17 +91,17 @@ class Splitter(object):
             for f in self.files:
                 self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
         self.fix_opf(opf)

         self.trees = None

     def split_text(self, text, root, size):
         self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
         rest = text.replace('\r', '')
         parts = re.split('\n\n', rest)
         self.log_debug('\t\t\t\tFound %d parts'%len(parts))
         if max(map(len, parts)) > size:
             raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)

         ans = []
         buf = ''
         for part in parts:

@@ -111,8 +111,8 @@ class Splitter(object):
                 ans.append(buf)
                 buf = part
         return ans

     def split_to_size(self, tree):
         self.log_debug('\t\tSplitting...')
         root = tree.getroot()

@@ -134,7 +134,7 @@ class Splitter(object):
                 p = pre.getparent()
                 i = p.index(pre)
                 p[i:i+1] = new_pres

         split_point, before = self.find_split_point(root)
         if split_point is None or self.split_size > 6*self.orig_size:
             if not self.always_remove:

@@ -142,7 +142,7 @@ class Splitter(object):
                                   'structure preservation. This may cause '
                                   'incorrect rendering.'))
             raise SplitError(self.path, root)

         for t in self.do_split(tree, split_point, before):
             r = t.getroot()
             if self.is_page_empty(r):

@@ -151,12 +151,12 @@ class Splitter(object):
             if size <= self.opts.profile.flow_size:
                 self.trees.append(t)
                 #print tostring(t.getroot(), pretty_print=True)
                 self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
                                len(self.trees), size/1024.)
                 self.split_size += size
             else:
                 self.split_to_size(t)

     def is_page_empty(self, root):
         body = root.find('body')
         if body is None:

@@ -170,14 +170,14 @@ class Splitter(object):
                 if img.get('style', '') != 'display:none':
                     return False
         return True

     def do_split(self, tree, split_point, before):
         '''
         Split ``tree`` into a *before* and *after* tree at ``split_point``,
         preserving tag structure, but not duplicating any text.
         All tags that have had their text and tail
         removed have the attribute ``calibre_split`` set to 1.

         :param before: If True tree is split before split_point, otherwise after split_point
         :return: before_tree, after_tree
         '''

@@ -188,7 +188,7 @@ class Splitter(object):
         body, body2 = root.body, root2.body
         split_point = root.xpath(path)[0]
         split_point2 = root2.xpath(path)[0]

         def nix_element(elem, top=True):
             if self.always_remove:
                 parent = elem.getparent()

@@ -198,18 +198,18 @@ class Splitter(object):
                 else:
                     index = parent.index(elem)
                     parent[index:index+1] = list(elem.iterchildren())
             else:
                 elem.text = u''
                 elem.tail = u''
                 elem.set(SPLIT_ATTR, '1')
                 if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']:
                     elem.set('style', 'display:none')

         def fix_split_point(sp):
             if not self.splitting_on_page_breaks:
                 sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')

         # Tree 1
         hit_split_point = False
         for elem in list(body.iterdescendants(etree.Element)):

@@ -223,8 +223,8 @@ class Splitter(object):
                 continue
             if hit_split_point:
                 nix_element(elem)

         # Tree 2
         hit_split_point = False
         for elem in list(body2.iterdescendants(etree.Element)):

@@ -238,17 +238,17 @@ class Splitter(object):
                 continue
             if not hit_split_point:
                 nix_element(elem, top=False)

         return tree, tree2

     def split_on_page_breaks(self, orig_tree):
         ordered_ids = []
         for elem in orig_tree.xpath('//*[@id]'):
             id = elem.get('id')
             if id in self.page_break_ids:
                 ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])

         self.trees = []
         tree = orig_tree
         for pattern, before in ordered_ids:

@@ -260,13 +260,13 @@ class Splitter(object):
             tree = after
         self.trees.append(tree)
         self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]

     def find_page_breaks(self, stylesheets, root):
         '''
         Find all elements that have either page-break-before or page-break-after set.
         Populates `self.page_breaks` with id based XPath selectors (for elements that don't
         have ids, an id is created).
         '''
         page_break_selectors = set([])

@@ -283,16 +283,16 @@ class Splitter(object):
                         page_break_selectors.add((CSSSelector(rule.selectorText), False))
                 except:
                     pass

         page_breaks = set([])
         for selector, before in page_break_selectors:
             for elem in selector(root):
                 elem.pb_before = before
                 page_breaks.add(elem)

         for i, elem in enumerate(root.iter()):
             elem.pb_order = i

         page_breaks = list(page_breaks)
         page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
         self.page_break_ids = []

@@ -300,12 +300,12 @@ class Splitter(object):
             x.set('id', x.get('id', 'calibre_pb_%d'%i))
             id = x.get('id')
             self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
             self.page_break_ids.append(id)

     def find_split_point(self, root):
         '''
         Find the tag at which to split the tree rooted at `root`.
         Search order is:
             * Heading tags
             * <div> tags

@@ -314,7 +314,7 @@ class Splitter(object):
             * <p> tags
             * <br> tags
             * <li> tags

         We try to split in the "middle" of the file (as defined by tag counts.
         '''
         def pick_elem(elems):

@@ -325,18 +325,18 @@ class Splitter(object):
                 i = int(math.floor(len(elems)/2.))
                 elems[i].set(SPLIT_POINT_ATTR, '1')
                 return elems[i]

         for path in (
                      '//*[re:match(name(), "h[1-6]", "i")]',
                      '/html/body/div',
                      '//pre',
                      '//hr',
                      '//p',
                      '//div',
                      '//br',
                      '//li',
                     ):
             elems = root.xpath(path,
                     namespaces={'re':'http://exslt.org/regular-expressions'})
             elem = pick_elem(elems)
             if elem is not None:

@@ -345,9 +345,9 @@ class Splitter(object):
                 except:
                     continue
                 return elem, True

         return None, True

     def commit(self):
         '''
         Commit all changes caused by the split. This removes the previously

@@ -357,7 +357,7 @@ class Splitter(object):
         '''
         self.anchor_map = collections.defaultdict(lambda :self.base%0)
         self.files = []

         for i, tree in enumerate(self.trees):
             root = tree.getroot()
             self.files.append(self.base%i)

@@ -367,7 +367,7 @@ class Splitter(object):
             for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)):
                 elem.attrib.pop(SPLIT_ATTR, None)
                 elem.attrib.pop(SPLIT_POINT_ATTR, '0')

         for current, tree in zip(self.files, self.trees):
             for a in tree.getroot().xpath('//a[@href]'):
                 href = a.get('href').strip()

@@ -375,10 +375,10 @@ class Splitter(object):
                     anchor = href[1:]
                     file = self.anchor_map[anchor]
                     if file != current:
                         a.set('href', file+href)

             open(content(current), 'wb').\
                 write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))

         os.remove(content(self.path))

@@ -391,12 +391,12 @@ class Splitter(object):
         id_map = {}
         for item in items:
             id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)

         for id in id_map.keys():
             opf.replace_spine_items_by_idref(id, id_map[id])

         for ref in opf.iterguide():
             href = ref.get('href', '')
             if href.startswith('content/'+self.path):
                 href = href.split('#')
                 frag = None

@@ -408,8 +408,8 @@ class Splitter(object):
                     new_file = self.anchor_map[frag]
                 ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))


 def fix_content_links(html_files, changes, opts):
     split_files = [f.path for f in changes]
     anchor_maps = [f.anchor_map for f in changes]

@@ -420,7 +420,7 @@ def fix_content_links(html_files, changes, opts):
             files[i:i+1] = changes[j].files
         except ValueError:
             continue

     for htmlfile in files:
         changed = False
         root = html.fromstring(open(content(htmlfile), 'rb').read())

@@ -439,7 +439,7 @@ def fix_content_links(html_files, changes, opts):
                     frag = ('#'+anchor) if anchor else ''
                     a.set('href', newf+frag)
                     changed = True

         if changed:
             open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))

@@ -448,7 +448,7 @@ def fix_ncx(path, changes):
     anchor_maps = [f.anchor_map for f in changes]
     tree = etree.parse(path)
     changed = False

     for content in tree.getroot().xpath('//x:content[@src]',
             namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
         href = content.get('src')
         if not href.startswith('#'):

@@ -481,21 +481,21 @@ def find_html_files(opf):
             if os.path.exists(content(f)):
                 html_files.append(f)

     return html_files


 def split(pathtoopf, opts, stylesheet_map):
     pathtoopf = os.path.abspath(pathtoopf)
     opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))

     with CurrentDir(os.path.dirname(pathtoopf)):
         html_files = find_html_files(opf)
         changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files]
         changes = [c for c in changes if c.was_split]

         fix_content_links(html_files, changes, opts)
         for item in opf.itermanifest():
             if item.get('media-type', '') == 'application/x-dtbncx+xml':
                 fix_ncx(item.get('href'), changes)
                 break

     open(pathtoopf, 'wb').write(opf.render())
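The paragraph-packing strategy used by split_text above can be shown in isolation: split on blank lines, then greedily pack parts into buffers that respect the size budget. This is a simplified standalone sketch, not the calibre implementation:

import re

def pack_paragraphs(text, size):
    # Split on blank lines, as split_text does...
    parts = re.split('\n\n', text.replace('\r', ''))
    ans, buf = [], ''
    for part in parts:
        # ...then greedily accumulate parts until the budget is hit.
        if len(buf) + len(part) + 2 > size:
            ans.append(buf)
            buf = part
        else:
            buf = buf + '\n\n' + part if buf else part
    if buf:
        ans.append(buf)
    return ans

chunks = pack_paragraphs('a\n\nbb\n\nccc\n\ndddd', 6)
print(chunks)  # ['a\n\nbb', 'ccc', 'dddd']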

View File

@@ -0,0 +1,30 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re

from lxml.etree import tostring as _tostring

def tostring(root, strip_comments=False, pretty_print=False):
    '''
    Serialize processed XHTML.
    '''
    root.set('xmlns', 'http://www.w3.org/1999/xhtml')
    root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink')
    for x in root.iter():
        if x.tag.rpartition('}')[-1].lower() == 'svg':
            x.set('xmlns', 'http://www.w3.org/2000/svg')

    ans = _tostring(root, encoding='utf-8', pretty_print=pretty_print)
    if strip_comments:
        ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
    ans = '<?xml version="1.0" encoding="utf-8" ?>\n'+ans

    return ans
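A quick sanity check of this serializer (a sketch, assuming an lxml build that permits setting the reserved xmlns attribute directly, as the code above does):

from lxml import etree

root = etree.fromstring('<html><body><p>Hi</p><!-- draft note --></body></html>')
xhtml = tostring(root, strip_comments=True)
# Output begins with the XML declaration and has comments stripped.
assert xhtml.startswith('<?xml version="1.0" encoding="utf-8" ?>')
assert '<!--' not in xhtml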

View File

@@ -0,0 +1,342 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

'''
Input plugin for HTML or OPF ebooks.
'''

import os, re, sys, cStringIO
from urlparse import urlparse, urlunparse
from urllib import unquote

from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.chardet import xml_to_unicode
from calibre.customize.conversion import OptionRecommendation
from calibre import unicode_path

class Link(object):
    '''
    Represents a link in a HTML file.
    '''

    @classmethod
    def url_to_local_path(cls, url, base):
        path = urlunparse(('', '', url.path, url.params, url.query, ''))
        path = unquote(path)
        if os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(base, path))

    def __init__(self, url, base):
        '''
        :param url: The url this link points to. Must be an unquoted unicode string.
        :param base: The base directory that relative URLs are with respect to.
                     Must be a unicode string.
        '''
        assert isinstance(url, unicode) and isinstance(base, unicode)
        self.url = url
        self.parsed_url = urlparse(self.url)
        self.is_local = self.parsed_url.scheme in ('', 'file')
        self.is_internal = self.is_local and not bool(self.parsed_url.path)
        self.path = None
        self.fragment = unquote(self.parsed_url.fragment)
        if self.is_local and not self.is_internal:
            self.path = self.url_to_local_path(self.parsed_url, base)

    def __hash__(self):
        if self.path is None:
            return hash(self.url)
        return hash(self.path)

    def __eq__(self, other):
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return u'Link: %s --> %s'%(self.url, self.path)


class IgnoreFile(Exception):

    def __init__(self, msg, errno):
        Exception.__init__(self, msg)
        self.doesnt_exist = errno == 2
        self.errno = errno


class HTMLFile(object):
    '''
    Contains basic information about an HTML file. This
    includes a list of links to other files as well as
    the encoding of each file. Also tries to detect if the file is not a HTML
    file in which case :member:`is_binary` is set to True.

    The encoding of the file is available as :member:`encoding`.
    '''

    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
    LINK_PAT = re.compile(
    r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
    re.DOTALL|re.IGNORECASE)

    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
        '''
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML.
        :param referrer: The :class:`HTMLFile` that first refers to this file.
        '''
        self.path = unicode_path(path_to_html_file, abs=True)
        self.title = os.path.splitext(os.path.basename(self.path))[0]
        self.base = os.path.dirname(self.path)
        self.level = level
        self.referrer = referrer
        self.links = []

        try:
            with open(self.path, 'rb') as f:
                src = f.read()
        except IOError, err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
        if not self.is_binary:
            if encoding is None:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
                self.encoding = encoding
            else:
                self.encoding = encoding

            src = src.decode(encoding, 'replace')
            match = self.TITLE_PAT.search(src)
            self.title = match.group(1) if match is not None else self.title
            self.find_links(src)

    def __eq__(self, other):
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)

    def __repr__(self):
        return str(self)

    def find_links(self, src):
        for match in self.LINK_PAT.finditer(src):
            url = None
            for i in ('url1', 'url2', 'url3'):
                url = match.group(i)
                if url:
                    break
            link = self.resolve(url)
            if link not in self.links:
                self.links.append(link)

    def resolve(self, url):
        return Link(url, self.base)


def depth_first(root, flat, visited=set([])):
    yield root
    visited.add(root)
    for link in root.links:
        if link.path is not None and link not in visited:
            try:
                index = flat.index(link)
            except ValueError: # Can happen if max_levels is used
                continue
            hf = flat[index]
            if hf not in visited:
                yield hf
                visited.add(hf)
                for hf in depth_first(hf, flat, visited):
                    if hf not in visited:
                        yield hf
                        visited.add(hf)


def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
    '''
    Recursively traverse all links in the HTML file.

    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
                       implies that no links in the root HTML file are followed.
    :param encoding: Specify character encoding of HTML files. If `None` it is
                     auto-detected.
    :return: A pair of lists (breadth_first, depth_first). Each list contains
             :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    level = 0
    flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
    next_level = list(flat)
    while level < max_levels and len(next_level) > 0:
        level += 1
        nl = []
        for hf in next_level:
            rejects = []
            for link in hf.links:
                if link.path is None or link.path in flat:
                    continue
                try:
                    nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
                    if nf.is_binary:
                        raise IgnoreFile('%s is a binary file'%nf.path, -1)
                    nl.append(nf)
                    flat.append(nf)
                except IgnoreFile, err:
                    rejects.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print repr(err)
            for link in rejects:
                hf.links.remove(link)

        next_level = list(nl)

    orec = sys.getrecursionlimit()
    sys.setrecursionlimit(500000)
    try:
        return flat, list(depth_first(flat[0], flat))
    finally:
        sys.setrecursionlimit(orec)


def opf_traverse(opf_reader, verbose=0, encoding=None):
    '''
    Return a list of :class:`HTMLFile` objects in the order specified by the
    `<spine>` element of the OPF.

    :param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance.
    :param encoding: Specify character encoding of HTML files. If `None` it is
                     auto-detected.
    '''
    if not opf_reader.spine:
        raise ValueError('OPF does not have a spine')
    flat = []
    for path in opf_reader.spine.items():
        path = os.path.abspath(path)
        if path not in flat:
            flat.append(os.path.abspath(path))
    for item in opf_reader.manifest:
        if 'html' in item.mime_type:
            path = os.path.abspath(item.path)
            if path not in flat:
                flat.append(path)
    for i, path in enumerate(flat):
        if not os.path.exists(path):
            path = path.replace('&', '%26')
            if os.path.exists(path):
                flat[i] = path
                for item in opf_reader.itermanifest():
                    item.set('href', item.get('href').replace('&', '%26'))
    ans = []
    for path in flat:
        if os.path.exists(path):
            ans.append(HTMLFile(path, 0, encoding, verbose))
        else:
            print 'WARNING: OPF spine item %s does not exist'%path
    ans = [f for f in ans if not f.is_binary]
    return ans


def search_for_opf(dir):
    for f in os.listdir(dir):
        if f.lower().endswith('.opf'):
            return OPF(open(os.path.join(dir, f), 'rb'), dir)


def get_filelist(htmlfile, dir, opts, log):
    '''
    Build list of files referenced by html file or try to detect and use an
    OPF file instead.
    '''
    print 'Building file list...'
    opf = search_for_opf(dir)
    filelist = None
    if opf is not None:
        try:
            filelist = opf_traverse(opf, verbose=opts.verbose,
                                    encoding=opts.input_encoding)
        except:
            pass
    if not filelist:
        filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
                            verbose=opts.verbose,
                            encoding=opts.input_encoding)\
                   [0 if opts.breadth_first else 1]
    if opts.verbose:
        log.debug('\tFound files...')
        for f in filelist:
            log.debug('\t\t', f)

    return opf, filelist


class HTMLInput(InputFormatPlugin):

    name        = 'HTML Input'
    author      = 'Kovid Goyal'
    description = 'Convert HTML and OPF files to an OEB'
    file_types  = set(['opf', 'html', 'htm', 'xhtml', 'xhtm'])

    options = set([
        OptionRecommendation(name='breadth_first',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Traverse links in HTML files breadth first. Normally, '
                   'they are traversed depth first.'
                   )
        ),

        OptionRecommendation(name='max_levels',
            recommended_value=5, level=OptionRecommendation.LOW,
            help=_('Maximum levels of recursion when following links in '
                   'HTML files. Must be non-negative. 0 implies that no '
                   'links in the root HTML file are followed. Default is '
                   '%default.'
                   )
        ),

    ])

    def convert(self, stream, opts, file_ext, log,
                accelerators):
        basedir = os.getcwd()

        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)

        if file_ext == 'opf':
            opf = OPF(stream, basedir)
            filelist = opf_traverse(opf, verbose=opts.verbose,
                                    encoding=opts.input_encoding)
            mi = MetaInformation(opf)
        else:
            opf, filelist = get_filelist(stream.name, basedir, opts, log)
            mi = MetaInformation(opf)
            mi.smart_update(get_metadata(stream, 'html'))

        mi = OPFCreator(os.getcwdu(), mi)
        mi.guide = None
        entries = [(f.path, 'application/xhtml+xml') for f in filelist]
        mi.create_manifest(entries)
        mi.create_spine([f.path for f in filelist])

        tocbuf = cStringIO.StringIO()
        mi.render(open('metadata.opf', 'wb'), tocbuf, 'toc.ncx')
        toc = tocbuf.getvalue()
        if toc:
            open('toc.ncx', 'wb').write(toc)

        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, os.path.abspath('metadata.opf'))
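The Link resolution above can be sanity-checked in isolation; the expected values follow directly from urlparse semantics (a sketch assuming a POSIX path layout):

link = Link(u'chapter2.html#notes', u'/books/demo')
print(link.is_local)    # True: no URL scheme
print(link.path)        # /books/demo/chapter2.html
print(link.fragment)    # notes

internal = Link(u'#top', u'/books/demo')
print(internal.is_internal)  # True: local, with no path component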

View File

@@ -683,26 +683,6 @@ class OPF(object):
         return property(fget=fget, fset=fset)

-    @dynamic_property
-    def title_sort(self):
-
-        def fget(self):
-            matches = self.title_path(self.metadata)
-            if matches:
-                for match in matches:
-                    ans = match.get('{%s}file-as'%self.NAMESPACES['opf'], None)
-                    if not ans:
-                        ans = match.get('file-as', None)
-                    if ans:
-                        return ans
-
-        def fset(self, val):
-            matches = self.title_path(self.metadata)
-            if matches:
-                matches[0].set('file-as', unicode(val))
-
-        return property(fget=fget, fset=fset)
-
     @dynamic_property
     def tags(self):

@@ -943,9 +923,10 @@ class OPFCreator(MetaInformation):
         from calibre.resources import opf_template
         from calibre.utils.genshi.template import MarkupTemplate
         template = MarkupTemplate(opf_template)
+        toc = getattr(self, 'toc', None)
         if self.manifest:
             self.manifest.set_basedir(self.base_path)
-            if ncx_manifest_entry is not None:
+            if ncx_manifest_entry is not None and toc is not None:
                 if not os.path.isabs(ncx_manifest_entry):
                     ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry)
                 remove = [i for i in self.manifest if i.id == 'ncx']

@@ -965,7 +946,6 @@ class OPFCreator(MetaInformation):
         opf = template.generate(__appname__=__appname__, mi=self, __version__=__version__).render('xml')
         opf_stream.write(opf)
         opf_stream.flush()
-        toc = getattr(self, 'toc', None)
         if toc is not None and ncx_stream is not None:
             toc.render(ncx_stream, self.application_id)
         ncx_stream.flush()

@@ -1030,19 +1010,8 @@ class OPFTest(unittest.TestCase):
         self.opf.smart_update(MetaInformation(self.opf))
         self.testReading()

-    def testCreator(self):
-        opf = OPFCreator(os.getcwd(), self.opf)
-        buf = cStringIO.StringIO()
-        opf.render(buf)
-        raw = buf.getvalue()
-        self.testReading(opf=OPF(cStringIO.StringIO(raw), os.getcwd()))
-
-    def testSmartUpdate(self):
-        self.opf.smart_update(self.opf)
-        self.testReading()
-
 def suite():
     return unittest.TestLoader().loadTestsFromTestCase(OPFTest)

 def test():
     unittest.TextTestRunner(verbosity=2).run(suite())

View File

@ -29,5 +29,5 @@ class MOBIInput(InputFormatPlugin):
             with open(f, 'wb') as q:
                 q.write(html.tostring(root, encoding='utf-8', method='xml',
                     include_meta_content_type=False))
-        accelerators['pagebreaks'] = {f: '//div[@class="mbp_pagebreak"]'}
+        accelerators['pagebreaks'] = {f: '//*[@class="mbp_pagebreak"]'}
         return mr.created_opf_path
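
Note: the accelerator XPath is widened from div elements to any element, since pagebreak markers are not always divs. A quick sketch of the difference (markup invented for illustration):

from lxml import html

root = html.fromstring(
    '<div><div class="mbp_pagebreak"></div>'
    '<p class="mbp_pagebreak"></p></div>')
# The old expression only matches the div marker:
print(len(root.xpath('//div[@class="mbp_pagebreak"]')))  # 1
# The new expression matches both pagebreak markers:
print(len(root.xpath('//*[@class="mbp_pagebreak"]')))    # 2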

View File

@ -522,7 +522,7 @@ class MobiReader(object):
         else:
             raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))
         if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
-            self.mobi_html = self.mobi_html.replace('\r ', '\n\n ')
+            self.mobi_html = self.mobi_html.replace('\r ', '\n\n').replace('\0', '')
         return processed_records
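
Note: besides normalizing the '\r '-style paragraph breaks, the new code strips NUL bytes from ancient MOBI records. A worked sketch of the cleanup (the input string is invented; the NUL stripping presumably guards against parse failures further down the pipeline):

raw = 'First paragraph.\r Second paragraph.\x00\x00'
cleaned = raw.replace('\r ', '\n\n').replace('\0', '')
# cleaned == 'First paragraph.\n\nSecond paragraph.'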

View File

@ -151,7 +151,7 @@ def resolve_base_href(root):
         return
     make_links_absolute(root, base_href, resolve_base_href=False)
 
-def rewrite_links(root, link_repl_func, resolve_base_href=True):
+def rewrite_links(root, link_repl_func, resolve_base_href=False):
     '''
     Rewrite all the links in the document. For each link
     ``link_repl_func(link)`` will be called, and the return value
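
Note: for reference, a minimal usage sketch of rewrite_links. The tree and replacement function below are invented, and this assumes the calibre copy behaves like its lxml.html ancestor:

from lxml import html

from calibre.ebooks.oeb.base import rewrite_links

root = html.fromstring('<p><a href="old/chap1.html">One</a></p>')

def repl(link):
    # Return the new value for each link; returning it unchanged is a no-op.
    return link.replace('old/', 'content/')

rewrite_links(root, repl)
# The anchor's href attribute is now 'content/chap1.html'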

View File

@ -6,9 +6,16 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import os, shutil
+import os
+from urllib import unquote as urlunquote
+from functools import partial
 
-from calibre.ebooks.oeb.base import OEB_DOCS
+from lxml import etree
+
+import cssutils
+
+from calibre.constants import islinux
+from calibre.ebooks.oeb.base import OEB_DOCS, urlnormalize, urldefrag, \
+    rewrite_links
 
 class Package(object):
@ -29,18 +36,69 @@ class Package(object):
         self.new_base_path = os.path.abspath(base)
 
     def rewrite_links_in(self, item):
-        new_items = []
-        return new_items
+        base = os.path.join(self.new_base_path, *item.href.split('/'))
+        base = os.path.dirname(base)
+        if etree.iselement(item.data):
+            self.rewrite_links_in_xml(item.data, base)
+        elif hasattr(item.data, 'cssText'):
+            self.rewrite_links_in_css(item.data, base)
+
+    def link_replacer(self, link_, base=''):
+        link = urlnormalize(link_)
+        link, frag = urldefrag(link)
+        link = urlunquote(link).replace('/', os.sep)
+        if base and not os.path.isabs(link):
+            link = os.path.join(base, link)
+        link = os.path.abspath(link)
+        if not islinux:
+            link = link.lower()
+        if link not in self.map:
+            return link_
+        nlink = os.path.relpath(self.map[link], base)
+        if frag:
+            nlink = '#'.join((nlink, frag))
+        return nlink.replace(os.sep, '/')
+
+    def rewrite_links_in_css(self, sheet, base):
+        repl = partial(self.link_replacer, base=base)
+        cssutils.replaceUrls(sheet, repl)
+
+    def rewrite_links_in_xml(self, root, base):
+        repl = partial(self.link_replacer, base=base)
+        rewrite_links(root, repl)
 
     def move_manifest_item(self, item):
         item.data # Make sure the data has been loaded and cached
-        old_abspath = os.path.join(self.old_base_path, *item.href.split('/'))
-        bname = item.href.split('/')[-1]
-        new_href = 'content/' + \
-            ('resources/' if item.media_type in OEB_DOCS else '')+bname
+        old_abspath = os.path.join(self.old_base_path,
+                *(urldefrag(item.href)[0].split('/')))
+        old_abspath = os.path.abspath(old_abspath)
+        bname = item.href.split('/')[-1].partition('#')[0]
+        new_href = 'content/resources/'
+        if item.media_type in OEB_DOCS:
+            new_href = 'content/'
+        elif item.href.lower().endswith('.ncx'):
+            new_href = ''
+        new_href += bname
+        new_abspath = os.path.join(self.new_base_path, *new_href.split('/'))
+        new_abspath = os.path.abspath(new_abspath)
+        item.href = new_href
+        if not islinux:
+            old_abspath, new_abspath = old_abspath.lower(), new_abspath.lower()
+        if old_abspath != new_abspath:
+            self.map[old_abspath] = new_abspath
+
+    def rewrite_links_in_toc(self, toc):
+        if toc.href:
+            toc.href = self.link_replacer(toc.href, base=self.new_base_path)
+        for x in toc:
+            self.rewrite_links_in_toc(x)
 
     def __call__(self, oeb, context):
         self.map = {}
+        self.log = self.oeb.log
         self.old_base_path = os.path.abspath(oeb.container.rootdir)
         for item in self.oeb.manifest:
@ -49,4 +107,9 @@ class Package(object):
         for item in self.oeb.manifest:
             self.rewrite_links_in(item)
+
+        if getattr(oeb.toc, 'nodes', False):
+            self.rewrite_links_in_toc(oeb.toc)
+
+        if hasattr(oeb, 'guide'):
+            for ref in oeb.guide.values():
+                ref.href = self.link_replacer(ref.href, base=self.new_base_path)
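
Note: to make the path mapping concrete, here is a worked sketch of the final step in link_replacer once move_manifest_item has filled self.map. All paths are invented for illustration:

import os

# self.map holds absolute old-location -> new-location paths, e.g.:
link_map = {'/old/book/images/cover.jpg':
            '/new/book/content/resources/cover.jpg'}

# base is the directory of the document whose links are being rewritten:
base = '/new/book/content'

nlink = os.path.relpath(link_map['/old/book/images/cover.jpg'], base)
print(nlink.replace(os.sep, '/'))  # -> 'resources/cover.jpg'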

View File

@ -6,11 +6,12 @@ from __future__ import with_statement
 __license__ = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
 
-from itertools import chain
 from urlparse import urldefrag
+
+import cssutils
+
 from calibre.ebooks.oeb.base import CSS_MIME, OEB_DOCS
-from calibre.ebooks.oeb.base import LINK_SELECTORS, CSSURL_RE
-from calibre.ebooks.oeb.base import urlnormalize
+from calibre.ebooks.oeb.base import urlnormalize, iterlinks
 
 class ManifestTrimmer(object):
     @classmethod
@ -44,16 +45,15 @@ class ManifestTrimmer(object):
             if (item.media_type in OEB_DOCS or
                 item.media_type[-4:] in ('/xml', '+xml')) and \
                item.data is not None:
-                hrefs = [sel(item.data) for sel in LINK_SELECTORS]
-                for href in chain(*hrefs):
+                hrefs = [r[2] for r in iterlinks(item.data)]
+                for href in hrefs:
                     href = item.abshref(urlnormalize(href))
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
             elif item.media_type == CSS_MIME:
-                for match in CSSURL_RE.finditer(item.data.cssText):
-                    href = match.group('url')
+                for href in cssutils.getUrls(item.data):
                     href = item.abshref(urlnormalize(href))
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
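
Note: the two link-extraction paths above can be sanity-checked in isolation. A sketch with invented documents, assuming calibre's iterlinks mirrors lxml's (element, attribute, link, pos) tuples:

import cssutils
from lxml import html

root = html.fromstring(
    '<body><img src="cover.jpg"/><a href="chap1.html">One</a></body>')
# Each result tuple is (element, attribute, link, pos); index 2 is the URL.
hrefs = [r[2] for r in root.iterlinks()]
print(hrefs)  # ['cover.jpg', 'chap1.html']

sheet = cssutils.parseString('body { background: url(bg.png) }')
print(list(cssutils.getUrls(sheet)))  # ['bg.png']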

View File

@ -22,9 +22,6 @@ entry_points = {
     'web2disk = calibre.web.fetch.simple:main',
     'feeds2disk = calibre.web.feeds.main:main',
     'calibre-server = calibre.library.server:main',
-    'feeds2lrf = calibre.ebooks.lrf.feeds.convert_from:main',
-    'feeds2epub = calibre.ebooks.epub.from_feeds:main',
-    'feeds2mobi = calibre.ebooks.mobi.from_feeds:main',
     'web2lrf = calibre.ebooks.lrf.web.convert_from:main',
     'lrf2lrs = calibre.ebooks.lrf.lrfparser:main',
     'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main',
@ -154,10 +151,7 @@ def setup_completion(fatal_errors):
     from calibre.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop
     from calibre.web.feeds.main import option_parser as feeds2disk
     from calibre.web.feeds.recipes import titles as feed_titles
-    from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
     from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
-    from calibre.ebooks.epub.from_feeds import option_parser as feeds2epub
-    from calibre.ebooks.mobi.from_feeds import option_parser as feeds2mobi
     from calibre.ebooks.epub.from_comic import option_parser as comic2epub
     from calibre.ebooks.metadata.fetch import option_parser as fem_op
     from calibre.gui2.main import option_parser as guiop
@ -192,9 +186,6 @@ def setup_completion(fatal_errors):
     f.write(opts_and_exts('comic2mobi', comic2epub, ['cbz', 'cbr']))
     f.write(opts_and_exts('comic2pdf', comic2epub, ['cbz', 'cbr']))
     f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
-    f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
-    f.write(opts_and_words('feeds2epub', feeds2epub, feed_titles))
-    f.write(opts_and_words('feeds2mobi', feeds2mobi, feed_titles))
     f.write(opts_and_words('fetch-ebook-metadata', fem_op, []))
     f.write(opts_and_words('calibre-smtp', smtp_op, []))
     f.write('''