mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Sync to pluginize
This commit is contained in:
commit
8fd446090c
@ -122,8 +122,9 @@ class InputFormatPlugin(Plugin):
|
|||||||
def convert(self, stream, options, file_ext, log, accelerators):
|
def convert(self, stream, options, file_ext, log, accelerators):
|
||||||
'''
|
'''
|
||||||
This method must be implemented in sub-classes. It must return
|
This method must be implemented in sub-classes. It must return
|
||||||
the path to the created OPF file. All output should be contained in
|
the path to the created OPF file or an :class:`OEBBook` instance.
|
||||||
the current directory. If this plugin creates files outside the current
|
All output should be contained in the current directory.
|
||||||
|
If this plugin creates files outside the current
|
||||||
directory they must be deleted/marked for deletion before this method
|
directory they must be deleted/marked for deletion before this method
|
||||||
returns.
|
returns.
|
||||||
|
|
||||||
|
@ -299,21 +299,15 @@ OptionRecommendation(name='language',
|
|||||||
|
|
||||||
# Create an OEBBook from the input file. The input plugin does all the
|
# Create an OEBBook from the input file. The input plugin does all the
|
||||||
# heavy lifting.
|
# heavy lifting.
|
||||||
from calibre.ebooks.oeb.reader import OEBReader
|
|
||||||
from calibre.ebooks.oeb.base import OEBBook
|
|
||||||
accelerators = {}
|
accelerators = {}
|
||||||
|
|
||||||
tdir = PersistentTemporaryDirectory('_plumber')
|
tdir = PersistentTemporaryDirectory('_plumber')
|
||||||
|
|
||||||
opfpath = self.input_plugin(open(self.input, 'rb'), self.opts,
|
self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts,
|
||||||
self.input_fmt, self.log,
|
self.input_fmt, self.log,
|
||||||
accelerators, tdir)
|
accelerators, tdir)
|
||||||
html_preprocessor = HTMLPreProcessor()
|
if not hasattr(self.oeb, 'manifest'):
|
||||||
self.reader = OEBReader()
|
self.oeb = create_oebbook(self.log, self.oeb)
|
||||||
self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor)
|
|
||||||
# Read OEB Book into OEBBook
|
|
||||||
self.log.info('Parsing all content...')
|
|
||||||
self.reader(self.oeb, opfpath)
|
|
||||||
|
|
||||||
self.opts.source = self.opts.input_profile
|
self.opts.source = self.opts.input_profile
|
||||||
self.opts.dest = self.opts.output_profile
|
self.opts.dest = self.opts.output_profile
|
||||||
@ -340,7 +334,20 @@ OptionRecommendation(name='language',
|
|||||||
trimmer(self.oeb, self.opts)
|
trimmer(self.oeb, self.opts)
|
||||||
|
|
||||||
self.log.info('Creating %s...'%self.output_plugin.name)
|
self.log.info('Creating %s...'%self.output_plugin.name)
|
||||||
self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts,
|
self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
|
||||||
self.log)
|
self.opts, self.log)
|
||||||
|
|
||||||
|
def create_oebbook(log, opfpath):
    '''
    Build an :class:`OEBBook` by reading the OPF file at `opfpath`.

    :param log: Logger given to the new book; also used to report progress.
    :param opfpath: Path of the OPF file handed to the reader.
    :return: The :class:`OEBBook` after the reader has been run on it.
    '''
    # Function-local imports, exactly as the function has always done them
    from calibre.ebooks.oeb.reader import OEBReader
    from calibre.ebooks.oeb.base import OEBBook

    preprocessor = HTMLPreProcessor()
    read_opf = OEBReader()
    book = OEBBook(log, html_preprocessor=preprocessor)

    log.info('Parsing all content...')
    # The reader fills `book` in place from the OPF
    read_opf(book, opfpath)
    return book
|
||||||
|
|
||||||
|
@ -10,7 +10,7 @@ import sys, textwrap, re, os, uuid
|
|||||||
from itertools import cycle
|
from itertools import cycle
|
||||||
from calibre.utils.config import Config, StringConfig
|
from calibre.utils.config import Config, StringConfig
|
||||||
from calibre.utils.zipfile import ZipFile, ZIP_STORED
|
from calibre.utils.zipfile import ZipFile, ZIP_STORED
|
||||||
from calibre.ebooks.html import config as common_config, tostring
|
from calibre.ebooks.html import tostring
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
class DefaultProfile(object):
|
class DefaultProfile(object):
|
||||||
|
@ -14,7 +14,7 @@ from lxml.cssselect import CSSSelector
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
from lxml.html import HtmlElement
|
from lxml.html import HtmlElement
|
||||||
|
|
||||||
from calibre.ebooks.html import fromstring
|
from calibre.ebooks.html_old import fromstring
|
||||||
from calibre.ebooks.epub import rules
|
from calibre.ebooks.epub import rules
|
||||||
from cssutils import CSSParser
|
from cssutils import CSSParser
|
||||||
|
|
||||||
|
@ -38,7 +38,7 @@ from lxml.etree import XPath
|
|||||||
from lxml import html, etree
|
from lxml import html, etree
|
||||||
from PyQt4.Qt import QApplication, QPixmap
|
from PyQt4.Qt import QApplication, QPixmap
|
||||||
|
|
||||||
from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\
|
from calibre.ebooks.html_old import Processor, merge_metadata, get_filelist,\
|
||||||
opf_traverse, create_metadata, rebase_toc, Link, parser
|
opf_traverse, create_metadata, rebase_toc, Link, parser
|
||||||
from calibre.ebooks.epub import config as common_config, tostring
|
from calibre.ebooks.epub import config as common_config, tostring
|
||||||
from calibre.ptempfile import TemporaryDirectory
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
|
@ -16,7 +16,7 @@ from calibre.ebooks.epub import config
|
|||||||
from calibre.ebooks.metadata.opf2 import OPF
|
from calibre.ebooks.metadata.opf2 import OPF
|
||||||
from calibre.ptempfile import TemporaryDirectory
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.ebooks.html import create_dir
|
from calibre.ebooks.html_old import create_dir
|
||||||
from calibre.utils.zipfile import safe_replace, ZipFile
|
from calibre.utils.zipfile import safe_replace, ZipFile
|
||||||
from calibre.utils.config import DynamicConfig
|
from calibre.utils.config import DynamicConfig
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
Split the flows in an epub file to conform to size limitations.
|
Split the flows in an epub file to conform to size limitations.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import os, math, logging, functools, collections, re, copy, sys
|
import os, math, functools, collections, re, copy, sys
|
||||||
|
|
||||||
from lxml.etree import XPath as _XPath
|
from lxml.etree import XPath as _XPath
|
||||||
from lxml import etree, html
|
from lxml import etree, html
|
||||||
|
30
src/calibre/ebooks/html/__init__.py
Normal file
30
src/calibre/ebooks/html/__init__.py
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
from __future__ import with_statement
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from lxml.etree import tostring as _tostring
|
||||||
|
|
||||||
|
def tostring(root, strip_comments=False, pretty_print=False):
    '''
    Serialize processed XHTML.

    Sets the XHTML (and, on ``svg`` elements, the SVG) ``xmlns`` attribute
    on the tree in place, serializes it as UTF-8 and prepends an XML
    declaration.

    :param root: Root element of the tree. It is modified in place.
    :param strip_comments: If True, ``<!-- ... -->`` comments are removed
                           from the serialized output.
    :param pretty_print: Passed through to the lxml serializer.
    :return: The serialized document, including the XML declaration.
    '''
    root.set('xmlns', 'http://www.w3.org/1999/xhtml')
    root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink')
    for x in root.iter():
        # iter() also yields comments and processing instructions, whose
        # tag is not a string; skip them instead of raising AttributeError.
        if hasattr(x.tag, 'rpartition') and \
                x.tag.rpartition('}')[-1].lower() == 'svg':
            x.set('xmlns', 'http://www.w3.org/2000/svg')

    ans = _tostring(root, encoding='utf-8', pretty_print=pretty_print)
    if strip_comments:
        # DOTALL so that comments spanning several lines are removed too
        ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
    ans = '<?xml version="1.0" encoding="utf-8" ?>\n'+ans

    return ans
|
||||||
|
|
||||||
|
|
342
src/calibre/ebooks/html/input.py
Normal file
342
src/calibre/ebooks/html/input.py
Normal file
@ -0,0 +1,342 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
from __future__ import with_statement
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
Input plugin for HTML or OPF ebooks.
|
||||||
|
'''
|
||||||
|
|
||||||
|
import os, re, sys, cStringIO
|
||||||
|
from urlparse import urlparse, urlunparse
|
||||||
|
from urllib import unquote
|
||||||
|
|
||||||
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
|
from calibre.ebooks.metadata.meta import get_metadata
|
||||||
|
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
|
||||||
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
from calibre.customize.conversion import OptionRecommendation
|
||||||
|
from calibre import unicode_path
|
||||||
|
|
||||||
|
class Link(object):
    '''
    Represents a link in a HTML file.

    Attributes set by the constructor:

      * ``url``: the URL exactly as passed in.
      * ``parsed_url``: the result of ``urlparse(url)``.
      * ``is_local``: True if the URL scheme is empty or ``file``.
      * ``is_internal``: True for a local URL with an empty path component.
      * ``path``: absolute local path for local, non-internal links,
        otherwise None.
      * ``fragment``: the unquoted fragment of the URL.
    '''

    @classmethod
    def url_to_local_path(cls, url, base):
        '''
        Convert a parsed URL into an absolute filesystem path.

        :param url: A parsed URL (only its path, params and query are used).
        :param base: Directory that relative paths are resolved against.
        :return: The unquoted path, made absolute relative to `base` if it
                 is not absolute already.
        '''
        # Drop scheme, host and fragment; keep path, params and query
        path = urlunparse(('', '', url.path, url.params, url.query, ''))
        path = unquote(path)
        if os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(base, path))

    def __init__(self, url, base):
        '''
        :param url:  The url this link points to. Must be an unquoted unicode string.
        :param base: The base directory that relative URLs are with respect to.
                     Must be a unicode string.
        '''
        assert isinstance(url, unicode) and isinstance(base, unicode)
        self.url         = url
        self.parsed_url  = urlparse(self.url)
        self.is_local    = self.parsed_url.scheme in ('', 'file')
        self.is_internal = self.is_local and not bool(self.parsed_url.path)
        self.path        = None
        self.fragment    = unquote(self.parsed_url.fragment)
        if self.is_local and not self.is_internal:
            self.path = self.url_to_local_path(self.parsed_url, base)

    def __hash__(self):
        # Links without a local path are identified by their URL
        if self.path is None:
            return hash(self.url)
        return hash(self.path)

    def __eq__(self, other):
        '''
        Links with a local path compare by path, and may be compared with
        any object that has a ``path`` attribute or with a plain path.
        Links without a local path are equal only to other such links with
        the same URL, which keeps equality consistent with :meth:`__hash__`.
        '''
        if self.path is None:
            return (isinstance(other, Link) and other.path is None
                    and other.url == self.url)
        return self.path == getattr(other, 'path', other)

    def __ne__(self, other):
        # Python 2 does not derive != from ==
        return not self.__eq__(other)

    def __str__(self):
        return u'Link: %s --> %s'%(self.url, self.path)
|
||||||
|
|
||||||
|
|
||||||
|
class IgnoreFile(Exception):
    '''
    Raised for a linked file that should be left out of the file list.

    ``errno`` holds the error number passed in and ``doesnt_exist`` is True
    exactly when that number is 2.
    '''

    def __init__(self, msg, errno):
        super(IgnoreFile, self).__init__(msg)
        self.errno = errno
        self.doesnt_exist = (errno == 2)
|
||||||
|
|
||||||
|
class HTMLFile(object):
    '''
    Contains basic information about an HTML file. This
    includes a list of links to other files as well as
    the encoding of each file. Also tries to detect if the file is not a HTML
    file in which case :member:`is_binary` is set to True.

    The encoding of the file is available as :member:`encoding`. For binary
    files it is simply the encoding that was passed in (possibly None).

    Two instances are equal, and hash equally, when their paths are equal.
    '''

    # Matches the opening of an <html> tag
    HTML_PAT  = re.compile(r'<\s*html', re.IGNORECASE)
    # Captures the text of a <title> element
    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
    # Captures the href of an <a> tag: double quoted (url1), single quoted
    # (url2) or unquoted (url3)
    LINK_PAT  = re.compile(
    r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
    re.DOTALL|re.IGNORECASE)

    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
        '''
        :param level:    The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML.
        :param referrer: The :class:`HTMLFile` that first refers to this file.

        :raises IOError: If the file cannot be read and `level` is 0.
        :raises IgnoreFile: If the file cannot be read and `level` is not 0.
        '''
        self.path     = unicode_path(path_to_html_file, abs=True)
        # Default title is the file name without extension
        self.title    = os.path.splitext(os.path.basename(self.path))[0]
        self.base     = os.path.dirname(self.path)
        self.level    = level
        self.referrer = referrer
        self.links    = []
        # Always define the attribute, even for binary files
        self.encoding = encoding

        try:
            with open(self.path, 'rb') as f:
                src = f.read()
        except IOError as err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

        # A file is treated as HTML only if an <html tag opens within the
        # first 1024 bytes
        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
        if not self.is_binary:
            if encoding is None:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
            self.encoding = encoding

            src = src.decode(encoding, 'replace')
            match = self.TITLE_PAT.search(src)
            self.title = match.group(1) if match is not None else self.title
            self.find_links(src)

    def __eq__(self, other):
        # `other` may be anything with a `path` attribute, or a plain path
        return self.path == getattr(other, 'path', other)

    def __ne__(self, other):
        # Python 2 does not derive != from ==
        return not self.__eq__(other)

    def __hash__(self):
        # Must agree with __eq__, which compares paths
        return hash(self.path)

    def __str__(self):
        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)

    def __repr__(self):
        return str(self)

    def find_links(self, src):
        '''
        Append a :class:`Link` to :member:`links` for every ``<a href>``
        found in `src` that is not already present.
        '''
        for match in self.LINK_PAT.finditer(src):
            url = None
            # Use whichever of the three alternatives matched
            for i in ('url1', 'url2', 'url3'):
                url = match.group(i)
                if url:
                    break
            link = self.resolve(url)
            if link not in self.links:
                self.links.append(link)

    def resolve(self, url):
        '''
        Return a :class:`Link` for `url`, relative to this file's directory.
        '''
        return Link(url, self.base)
|
||||||
|
|
||||||
|
|
||||||
|
def depth_first(root, flat, visited=None):
    '''
    Yield `root` and then, depth first, every file in `flat` that is
    reachable from it by following links.

    :param root: The file to start from. It is always yielded first.
    :param flat: List of all known files. Links are resolved against it
                 with ``flat.index(link)``; links that are not found are
                 skipped.
    :param visited: Set of files that have already been yielded. It is
                    updated in place. If None (the default) a fresh set is
                    used for this traversal, so separate traversals never
                    share state.
    '''
    if visited is None:
        visited = set()
    yield root
    visited.add(root)
    for link in root.links:
        if link.path is not None and link not in visited:
            try:
                index = flat.index(link)
            except ValueError: # Can happen if max_levels is used
                continue
            hf = flat[index]
            if hf not in visited:
                yield hf
                visited.add(hf)
                # The recursive generator yields `hf` itself first; the
                # membership test below filters out it and anything else
                # that has already been yielded.
                for hf in depth_first(hf, flat, visited):
                    if hf not in visited:
                        yield hf
                        visited.add(hf)
|
||||||
|
|
||||||
|
|
||||||
|
def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
    '''
    Recursively traverse all links in the HTML file.

    Links that cannot be read, or that point to binary files, are removed
    from the ``links`` list of the file that contains them.

    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
                       implies that no links in the root HTML file are followed.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    :return:           A pair of lists (breadth_first, depth_first). Each list contains
                       :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    depth = 0
    flat = [HTMLFile(path_to_html_file, depth, encoding, verbose)]
    frontier = list(flat)
    while depth < max_levels and frontier:
        depth += 1
        discovered = []
        for parent in frontier:
            unusable = []
            for link in parent.links:
                # Skip non-local links and files that are already known
                if link.path is None or link.path in flat:
                    continue
                try:
                    child = HTMLFile(link.path, depth, encoding, verbose,
                                     referrer=parent)
                    if child.is_binary:
                        raise IgnoreFile('%s is a binary file'%child.path, -1)
                except IgnoreFile as err:
                    unusable.append(link)
                    # Missing files are only reported at high verbosity
                    if not err.doesnt_exist or verbose > 1:
                        print(repr(err))
                else:
                    discovered.append(child)
                    flat.append(child)
            # Removal is deferred so the list is not mutated while iterating
            for link in unusable:
                parent.links.remove(link)

        frontier = discovered

    # depth_first() is recursive, so lift the recursion limit while it runs
    saved_limit = sys.getrecursionlimit()
    sys.setrecursionlimit(500000)
    try:
        ordered = list(depth_first(flat[0], flat))
        return flat, ordered
    finally:
        sys.setrecursionlimit(saved_limit)
|
||||||
|
|
||||||
|
|
||||||
|
def opf_traverse(opf_reader, verbose=0, encoding=None):
    '''
    Return a list of :class:`HTMLFile` objects in the order specified by the
    `<spine>` element of the OPF.

    Manifest items whose mime type contains ``html`` are appended after the
    spine items. Paths that do not exist are reported and left out, as are
    files detected as binary.

    :param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    :raises ValueError: If the OPF has no spine.
    '''
    if not opf_reader.spine:
        raise ValueError('OPF does not have a spine')

    paths = []
    for spine_path in opf_reader.spine.items():
        spine_path = os.path.abspath(spine_path)
        if spine_path not in paths:
            paths.append(spine_path)
    for entry in opf_reader.manifest:
        if 'html' not in entry.mime_type:
            continue
        entry_path = os.path.abspath(entry.path)
        if entry_path not in paths:
            paths.append(entry_path)

    for pos, candidate in enumerate(paths):
        if os.path.exists(candidate):
            continue
        # Retry with the ampersand percent-encoded
        escaped = candidate.replace('&', '%26')
        if os.path.exists(escaped):
            paths[pos] = escaped
            # Apply the same escaping to every manifest href
            for entry in opf_reader.itermanifest():
                entry.set('href', entry.get('href').replace('&', '%26'))

    found = []
    for candidate in paths:
        if not os.path.exists(candidate):
            print('WARNING: OPF spine item %s does not exist'%candidate)
            continue
        found.append(HTMLFile(candidate, 0, encoding, verbose))

    return [hf for hf in found if not hf.is_binary]
|
||||||
|
|
||||||
|
def search_for_opf(dir):
    '''
    Return an :class:`OPF` for the first entry of `dir` whose name ends in
    ``.opf`` (case insensitively), or None if there is no such entry.
    '''
    for name in os.listdir(dir):
        if not name.lower().endswith('.opf'):
            continue
        stream = open(os.path.join(dir, name), 'rb')
        return OPF(stream, dir)
    return None
|
||||||
|
|
||||||
|
def get_filelist(htmlfile, dir, opts, log):
    '''
    Build list of files referenced by html file or try to detect and use an
    OPF file instead.

    :param htmlfile: Path to the root HTML file.
    :param dir: Directory that is searched for an OPF file.
    :param opts: Conversion options. Reads ``verbose``, ``input_encoding``,
                 ``max_levels`` and ``breadth_first``.
    :param log: Logger used for debug output.
    :return: A pair ``(opf, filelist)``. ``opf`` is None if no OPF file was
             found in `dir`.
    '''
    print('Building file list...')
    opf = search_for_opf(dir)
    filelist = None
    if opf is not None:
        try:
            filelist = opf_traverse(opf, verbose=opts.verbose,
                    encoding=opts.input_encoding)
        except Exception as err:
            # Using the OPF is best effort: fall back to following the links
            # in the HTML file. KeyboardInterrupt and SystemExit are no
            # longer swallowed here.
            log.debug('\tCould not use OPF file, following links instead:', err)
    if not filelist:
        filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
                            verbose=opts.verbose,
                            encoding=opts.input_encoding)\
                    [0 if opts.breadth_first else 1]
    if opts.verbose:
        log.debug('\tFound files...')
        for f in filelist:
            log.debug('\t\t', f)
    return opf, filelist
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLInput(InputFormatPlugin):
    '''
    Input plugin that collects an HTML file and the files it links to (or
    the files listed in an OPF), writes a ``metadata.opf`` describing them
    into the current directory and returns the book created from that OPF.
    '''

    name        = 'HTML Input'
    author      = 'Kovid Goyal'
    description = 'Convert HTML and OPF files to an OEB'
    file_types  = set(['opf', 'html', 'htm', 'xhtml', 'xhtm'])

    options = set([
        OptionRecommendation(name='breadth_first',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Traverse links in HTML files breadth first. Normally, '
                    'they are traversed depth first.'
                   )
        ),

        OptionRecommendation(name='max_levels',
            recommended_value=5, level=OptionRecommendation.LOW,
            help=_('Maximum levels of recursion when following links in '
                   'HTML files. Must be non-negative. 0 implies that no '
                   'links in the root HTML file are followed. Default is '
                   '%default.'
                   )
        ),

    ])

    def convert(self, stream, opts, file_ext, log,
                accelerators):
        '''
        Create ``metadata.opf`` (and ``toc.ncx`` if a table of contents was
        rendered) in the current directory and return the result of
        ``create_oebbook`` for it.

        :param stream: The open input file. Its ``name`` is used to locate
                       the input on disk.
        :param opts: Conversion options.
        :param file_ext: Extension of the input file. ``'opf'`` selects OPF
                         processing, anything else is treated as HTML.
        :param log: Logger.
        :param accelerators: Not used by this plugin.
        '''
        basedir = os.getcwd()
        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
        if file_ext == 'opf':
            opf = OPF(stream, basedir)
            filelist = opf_traverse(opf, verbose=opts.verbose,
                    encoding=opts.input_encoding)
            mi = MetaInformation(opf)
        else:
            opf, filelist = get_filelist(stream.name, basedir, opts, log)
            mi = MetaInformation(opf)
            mi.smart_update(get_metadata(stream, 'html'))

        mi = OPFCreator(os.getcwdu(), mi)
        mi.guide = None
        entries = [(f.path, 'application/xhtml+xml') for f in filelist]
        mi.create_manifest(entries)
        mi.create_spine([f.path for f in filelist])

        tocbuf = cStringIO.StringIO()
        # Close the OPF explicitly so it is complete on disk before it is
        # read back below.
        with open('metadata.opf', 'wb') as opf_stream:
            mi.render(opf_stream, tocbuf, 'toc.ncx')
        toc = tocbuf.getvalue()
        if toc:
            with open('toc.ncx', 'wb') as ncx_stream:
                ncx_stream.write(toc)

        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, os.path.abspath('metadata.opf'))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -683,26 +683,6 @@ class OPF(object):
|
|||||||
|
|
||||||
return property(fget=fget, fset=fset)
|
return property(fget=fget, fset=fset)
|
||||||
|
|
||||||
@dynamic_property
|
|
||||||
def title_sort(self):
|
|
||||||
|
|
||||||
def fget(self):
|
|
||||||
matches = self.title_path(self.metadata)
|
|
||||||
if matches:
|
|
||||||
for match in matches:
|
|
||||||
ans = match.get('{%s}file-as'%self.NAMESPACES['opf'], None)
|
|
||||||
if not ans:
|
|
||||||
ans = match.get('file-as', None)
|
|
||||||
if ans:
|
|
||||||
return ans
|
|
||||||
|
|
||||||
def fset(self, val):
|
|
||||||
matches = self.title_path(self.metadata)
|
|
||||||
if matches:
|
|
||||||
matches[0].set('file-as', unicode(val))
|
|
||||||
|
|
||||||
return property(fget=fget, fset=fset)
|
|
||||||
|
|
||||||
@dynamic_property
|
@dynamic_property
|
||||||
def tags(self):
|
def tags(self):
|
||||||
|
|
||||||
@ -943,9 +923,10 @@ class OPFCreator(MetaInformation):
|
|||||||
from calibre.resources import opf_template
|
from calibre.resources import opf_template
|
||||||
from calibre.utils.genshi.template import MarkupTemplate
|
from calibre.utils.genshi.template import MarkupTemplate
|
||||||
template = MarkupTemplate(opf_template)
|
template = MarkupTemplate(opf_template)
|
||||||
|
toc = getattr(self, 'toc', None)
|
||||||
if self.manifest:
|
if self.manifest:
|
||||||
self.manifest.set_basedir(self.base_path)
|
self.manifest.set_basedir(self.base_path)
|
||||||
if ncx_manifest_entry is not None:
|
if ncx_manifest_entry is not None and toc is not None:
|
||||||
if not os.path.isabs(ncx_manifest_entry):
|
if not os.path.isabs(ncx_manifest_entry):
|
||||||
ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry)
|
ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry)
|
||||||
remove = [i for i in self.manifest if i.id == 'ncx']
|
remove = [i for i in self.manifest if i.id == 'ncx']
|
||||||
@ -965,7 +946,6 @@ class OPFCreator(MetaInformation):
|
|||||||
opf = template.generate(__appname__=__appname__, mi=self, __version__=__version__).render('xml')
|
opf = template.generate(__appname__=__appname__, mi=self, __version__=__version__).render('xml')
|
||||||
opf_stream.write(opf)
|
opf_stream.write(opf)
|
||||||
opf_stream.flush()
|
opf_stream.flush()
|
||||||
toc = getattr(self, 'toc', None)
|
|
||||||
if toc is not None and ncx_stream is not None:
|
if toc is not None and ncx_stream is not None:
|
||||||
toc.render(ncx_stream, self.application_id)
|
toc.render(ncx_stream, self.application_id)
|
||||||
ncx_stream.flush()
|
ncx_stream.flush()
|
||||||
@ -1030,17 +1010,6 @@ class OPFTest(unittest.TestCase):
|
|||||||
self.opf.smart_update(MetaInformation(self.opf))
|
self.opf.smart_update(MetaInformation(self.opf))
|
||||||
self.testReading()
|
self.testReading()
|
||||||
|
|
||||||
def testCreator(self):
|
|
||||||
opf = OPFCreator(os.getcwd(), self.opf)
|
|
||||||
buf = cStringIO.StringIO()
|
|
||||||
opf.render(buf)
|
|
||||||
raw = buf.getvalue()
|
|
||||||
self.testReading(opf=OPF(cStringIO.StringIO(raw), os.getcwd()))
|
|
||||||
|
|
||||||
def testSmartUpdate(self):
|
|
||||||
self.opf.smart_update(self.opf)
|
|
||||||
self.testReading()
|
|
||||||
|
|
||||||
def suite():
|
def suite():
|
||||||
return unittest.TestLoader().loadTestsFromTestCase(OPFTest)
|
return unittest.TestLoader().loadTestsFromTestCase(OPFTest)
|
||||||
|
|
||||||
|
@ -29,5 +29,5 @@ class MOBIInput(InputFormatPlugin):
|
|||||||
with open(f, 'wb') as q:
|
with open(f, 'wb') as q:
|
||||||
q.write(html.tostring(root, encoding='utf-8', method='xml',
|
q.write(html.tostring(root, encoding='utf-8', method='xml',
|
||||||
include_meta_content_type=False))
|
include_meta_content_type=False))
|
||||||
accelerators['pagebreaks'] = {f: '//div[@class="mbp_pagebreak"]'}
|
accelerators['pagebreaks'] = {f: '//*[@class="mbp_pagebreak"]'}
|
||||||
return mr.created_opf_path
|
return mr.created_opf_path
|
||||||
|
@ -522,7 +522,7 @@ class MobiReader(object):
|
|||||||
else:
|
else:
|
||||||
raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))
|
raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))
|
||||||
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
|
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
|
||||||
self.mobi_html = self.mobi_html.replace('\r ', '\n\n ')
|
self.mobi_html = self.mobi_html.replace('\r ', '\n\n').replace('\0', '')
|
||||||
return processed_records
|
return processed_records
|
||||||
|
|
||||||
|
|
||||||
|
@ -151,7 +151,7 @@ def resolve_base_href(root):
|
|||||||
return
|
return
|
||||||
make_links_absolute(root, base_href, resolve_base_href=False)
|
make_links_absolute(root, base_href, resolve_base_href=False)
|
||||||
|
|
||||||
def rewrite_links(root, link_repl_func, resolve_base_href=True):
|
def rewrite_links(root, link_repl_func, resolve_base_href=False):
|
||||||
'''
|
'''
|
||||||
Rewrite all the links in the document. For each link
|
Rewrite all the links in the document. For each link
|
||||||
``link_repl_func(link)`` will be called, and the return value
|
``link_repl_func(link)`` will be called, and the return value
|
||||||
|
@ -6,9 +6,16 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import os, shutil
|
import os
|
||||||
|
from urllib import unquote as urlunquote
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
from calibre.ebooks.oeb.base import OEB_DOCS
|
from lxml import etree
|
||||||
|
import cssutils
|
||||||
|
|
||||||
|
from calibre.constants import islinux
|
||||||
|
from calibre.ebooks.oeb.base import OEB_DOCS, urlnormalize, urldefrag, \
|
||||||
|
rewrite_links
|
||||||
|
|
||||||
class Package(object):
|
class Package(object):
|
||||||
|
|
||||||
@ -29,18 +36,69 @@ class Package(object):
|
|||||||
self.new_base_path = os.path.abspath(base)
|
self.new_base_path = os.path.abspath(base)
|
||||||
|
|
||||||
def rewrite_links_in(self, item):
|
def rewrite_links_in(self, item):
|
||||||
new_items = []
|
base = os.path.join(self.new_base_path, *item.href.split('/'))
|
||||||
return new_items
|
base = os.path.dirname(base)
|
||||||
|
|
||||||
|
if etree.iselement(item.data):
|
||||||
|
self.rewrite_links_in_xml(item.data, base)
|
||||||
|
elif hasattr(item.data, 'cssText'):
|
||||||
|
self.rewrite_links_in_css(item.data, base)
|
||||||
|
|
||||||
|
def link_replacer(self, link_, base=''):
    '''
    Return the replacement for the link `link_`.

    The link is resolved to an absolute filesystem path and looked up in
    ``self.map``. If it is not there, `link_` is returned unchanged.
    Otherwise the mapped path is returned relative to `base`, with ``/`` as
    separator and with the original fragment, if any, re-attached.

    :param link_: The link as it appears in the document.
    :param base: Directory that relative links are resolved against and
                 that the result is made relative to.
    '''
    link = urlnormalize(link_)
    link, frag = urldefrag(link)
    link = urlunquote(link).replace('/', os.sep)
    if base and not os.path.isabs(link):
        link = os.path.join(base, link)
    link = os.path.abspath(link)
    if not islinux:
        link = link.lower()
    if link not in self.map:
        return link_
    nlink = os.path.relpath(self.map[link], base)
    # Convert separators before the fragment is attached, so that only the
    # path part is touched.
    nlink = nlink.replace(os.sep, '/')
    if frag:
        # str.join takes a single iterable
        nlink = '#'.join((nlink, frag))
    return nlink
|
||||||
|
|
||||||
|
def rewrite_links_in_css(self, sheet, base):
    '''
    Pass every URL in the stylesheet `sheet` through
    :meth:`link_replacer`, with `base` as the base directory.
    '''
    cssutils.replaceUrls(sheet, partial(self.link_replacer, base=base))
|
||||||
|
|
||||||
|
def rewrite_links_in_xml(self, root, base):
    '''
    Pass every link in the tree `root` through :meth:`link_replacer`,
    with `base` as the base directory.
    '''
    rewrite_links(root, partial(self.link_replacer, base=base))
|
||||||
|
|
||||||
def move_manifest_item(self, item):
|
def move_manifest_item(self, item):
|
||||||
item.data # Make sure the data has been loaded and cached
|
item.data # Make sure the data has been loaded and cached
|
||||||
old_abspath = os.path.join(self.old_base_path, *item.href.split('/'))
|
old_abspath = os.path.join(self.old_base_path,
|
||||||
bname = item.href.split('/')[-1]
|
*(urldefrag(item.href)[0].split('/')))
|
||||||
new_href = 'content/' + \
|
old_abspath = os.path.abspath(old_abspath)
|
||||||
('resources/' if item.media_type in OEB_DOCS else '')+bname
|
bname = item.href.split('/')[-1].partition('#')[0]
|
||||||
|
new_href = 'content/resources/'
|
||||||
|
if item.media_type in OEB_DOCS:
|
||||||
|
new_href = 'content/'
|
||||||
|
elif item.href.lower().endswith('.ncx'):
|
||||||
|
new_href = ''
|
||||||
|
new_href += bname
|
||||||
|
|
||||||
|
new_abspath = os.path.join(self.new_base_path, *new_href.split('/'))
|
||||||
|
new_abspath = os.path.abspath(new_abspath)
|
||||||
|
item.href = new_href
|
||||||
|
if not islinux:
|
||||||
|
old_abspath, new_abspath = old_abspath.lower(), new_abspath.lower()
|
||||||
|
if old_abspath != new_abspath:
|
||||||
|
self.map[old_abspath] = new_abspath
|
||||||
|
|
||||||
|
def rewrite_links_in_toc(self, toc):
    '''
    Rewrite the href of `toc` and, recursively, of every node below it.

    Nodes whose href is empty are left alone. Hrefs are resolved relative
    to ``self.new_base_path``.
    '''
    current = toc.href
    if current:
        toc.href = self.link_replacer(current, base=self.new_base_path)

    for child in toc:
        self.rewrite_links_in_toc(child)
|
||||||
|
|
||||||
def __call__(self, oeb, context):
|
def __call__(self, oeb, context):
|
||||||
self.map = {}
|
self.map = {}
|
||||||
|
self.log = self.oeb.log
|
||||||
self.old_base_path = os.path.abspath(oeb.container.rootdir)
|
self.old_base_path = os.path.abspath(oeb.container.rootdir)
|
||||||
|
|
||||||
for item in self.oeb.manifest:
|
for item in self.oeb.manifest:
|
||||||
@ -49,4 +107,9 @@ class Package(object):
|
|||||||
for item in self.oeb.manifest:
|
for item in self.oeb.manifest:
|
||||||
self.rewrite_links_in(item)
|
self.rewrite_links_in(item)
|
||||||
|
|
||||||
|
if getattr(oeb.toc, 'nodes', False):
|
||||||
|
self.rewrite_links_in_toc(oeb.toc)
|
||||||
|
|
||||||
|
if hasattr(oeb, 'guide'):
|
||||||
|
for ref in oeb.guide.values():
|
||||||
|
ref.href = self.link_replacer(ref.href, base=self.new_base_path)
|
||||||
|
@ -6,11 +6,12 @@ from __future__ import with_statement
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||||
|
|
||||||
from itertools import chain
|
|
||||||
from urlparse import urldefrag
|
from urlparse import urldefrag
|
||||||
|
|
||||||
|
import cssutils
|
||||||
|
|
||||||
from calibre.ebooks.oeb.base import CSS_MIME, OEB_DOCS
|
from calibre.ebooks.oeb.base import CSS_MIME, OEB_DOCS
|
||||||
from calibre.ebooks.oeb.base import LINK_SELECTORS, CSSURL_RE
|
from calibre.ebooks.oeb.base import urlnormalize, iterlinks
|
||||||
from calibre.ebooks.oeb.base import urlnormalize
|
|
||||||
|
|
||||||
class ManifestTrimmer(object):
|
class ManifestTrimmer(object):
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -44,16 +45,15 @@ class ManifestTrimmer(object):
|
|||||||
if (item.media_type in OEB_DOCS or
|
if (item.media_type in OEB_DOCS or
|
||||||
item.media_type[-4:] in ('/xml', '+xml')) and \
|
item.media_type[-4:] in ('/xml', '+xml')) and \
|
||||||
item.data is not None:
|
item.data is not None:
|
||||||
hrefs = [sel(item.data) for sel in LINK_SELECTORS]
|
hrefs = [r[2] for r in iterlinks(item.data)]
|
||||||
for href in chain(*hrefs):
|
for href in hrefs:
|
||||||
href = item.abshref(urlnormalize(href))
|
href = item.abshref(urlnormalize(href))
|
||||||
if href in oeb.manifest.hrefs:
|
if href in oeb.manifest.hrefs:
|
||||||
found = oeb.manifest.hrefs[href]
|
found = oeb.manifest.hrefs[href]
|
||||||
if found not in used:
|
if found not in used:
|
||||||
new.add(found)
|
new.add(found)
|
||||||
elif item.media_type == CSS_MIME:
|
elif item.media_type == CSS_MIME:
|
||||||
for match in CSSURL_RE.finditer(item.data.cssText):
|
for href in cssutils.getUrls(item.data):
|
||||||
href = match.group('url')
|
|
||||||
href = item.abshref(urlnormalize(href))
|
href = item.abshref(urlnormalize(href))
|
||||||
if href in oeb.manifest.hrefs:
|
if href in oeb.manifest.hrefs:
|
||||||
found = oeb.manifest.hrefs[href]
|
found = oeb.manifest.hrefs[href]
|
||||||
|
@ -22,9 +22,6 @@ entry_points = {
|
|||||||
'web2disk = calibre.web.fetch.simple:main',
|
'web2disk = calibre.web.fetch.simple:main',
|
||||||
'feeds2disk = calibre.web.feeds.main:main',
|
'feeds2disk = calibre.web.feeds.main:main',
|
||||||
'calibre-server = calibre.library.server:main',
|
'calibre-server = calibre.library.server:main',
|
||||||
'feeds2lrf = calibre.ebooks.lrf.feeds.convert_from:main',
|
|
||||||
'feeds2epub = calibre.ebooks.epub.from_feeds:main',
|
|
||||||
'feeds2mobi = calibre.ebooks.mobi.from_feeds:main',
|
|
||||||
'web2lrf = calibre.ebooks.lrf.web.convert_from:main',
|
'web2lrf = calibre.ebooks.lrf.web.convert_from:main',
|
||||||
'lrf2lrs = calibre.ebooks.lrf.lrfparser:main',
|
'lrf2lrs = calibre.ebooks.lrf.lrfparser:main',
|
||||||
'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main',
|
'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main',
|
||||||
@ -154,10 +151,7 @@ def setup_completion(fatal_errors):
|
|||||||
from calibre.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop
|
from calibre.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop
|
||||||
from calibre.web.feeds.main import option_parser as feeds2disk
|
from calibre.web.feeds.main import option_parser as feeds2disk
|
||||||
from calibre.web.feeds.recipes import titles as feed_titles
|
from calibre.web.feeds.recipes import titles as feed_titles
|
||||||
from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
|
|
||||||
from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
|
from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
|
||||||
from calibre.ebooks.epub.from_feeds import option_parser as feeds2epub
|
|
||||||
from calibre.ebooks.mobi.from_feeds import option_parser as feeds2mobi
|
|
||||||
from calibre.ebooks.epub.from_comic import option_parser as comic2epub
|
from calibre.ebooks.epub.from_comic import option_parser as comic2epub
|
||||||
from calibre.ebooks.metadata.fetch import option_parser as fem_op
|
from calibre.ebooks.metadata.fetch import option_parser as fem_op
|
||||||
from calibre.gui2.main import option_parser as guiop
|
from calibre.gui2.main import option_parser as guiop
|
||||||
@ -192,9 +186,6 @@ def setup_completion(fatal_errors):
|
|||||||
f.write(opts_and_exts('comic2mobi', comic2epub, ['cbz', 'cbr']))
|
f.write(opts_and_exts('comic2mobi', comic2epub, ['cbz', 'cbr']))
|
||||||
f.write(opts_and_exts('comic2pdf', comic2epub, ['cbz', 'cbr']))
|
f.write(opts_and_exts('comic2pdf', comic2epub, ['cbz', 'cbr']))
|
||||||
f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
|
f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
|
||||||
f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
|
|
||||||
f.write(opts_and_words('feeds2epub', feeds2epub, feed_titles))
|
|
||||||
f.write(opts_and_words('feeds2mobi', feeds2mobi, feed_titles))
|
|
||||||
f.write(opts_and_words('fetch-ebook-metadata', fem_op, []))
|
f.write(opts_and_words('fetch-ebook-metadata', fem_op, []))
|
||||||
f.write(opts_and_words('calibre-smtp', smtp_op, []))
|
f.write(opts_and_words('calibre-smtp', smtp_op, []))
|
||||||
f.write('''
|
f.write('''
|
||||||
|
Loading…
x
Reference in New Issue
Block a user