IGN:Working html2oeb

This commit is contained in:
Kovid Goyal 2008-08-26 12:11:03 -07:00
parent 39afcb27f7
commit 41a938aef0
6 changed files with 63 additions and 246 deletions

View File

@ -268,6 +268,9 @@ class LoggingInterface:
self.__logger = logger
def setup_cli_handler(self, verbosity):
for handler in self.__logger.handlers:
if isinstance(handler, logging.StreamHandler):
return
if os.environ.get('CALIBRE_WORKER', None) is not None and self.__logger.handlers:
return
stream = sys.stdout

View File

@ -19,6 +19,7 @@ def config(defaults=None):
c.update(common_config())
c.remove_opt('output')
c.remove_opt('zip')
c.add_opt('output', ['-o', '--output'], default=None,
help=_('The output EPUB file. If not specified, it is derived from the input file name.'))

View File

@ -1,218 +0,0 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Recursively parse HTML files to find all linked files. See :function:`traverse`.
'''
import sys, os, re
from urlparse import urlparse
from urllib import unquote
from calibre import unicode_path
from calibre.ebooks.chardet import xml_to_unicode
class Link(object):
    '''
    A single hyperlink discovered in an HTML file, resolved against a
    base directory so local targets can be mapped to filesystem paths.
    '''

    @classmethod
    def url_to_local_path(cls, url, base):
        # Absolute paths pass through untouched; relative ones are
        # anchored at `base` and normalized.
        candidate = url.path
        if os.path.isabs(candidate):
            return candidate
        return os.path.abspath(os.path.join(base, candidate))

    def __init__(self, url, base):
        '''
        :param url: The url this link points to. Must be an unquoted unicode string.
        :param base: The base directory that relative URLs are with respect to.
                     Must be a unicode string.
        '''
        assert isinstance(url, unicode) and isinstance(base, unicode)
        self.url = url
        self.parsed_url = urlparse(unquote(self.url))
        # Local means no scheme or an explicit file:// scheme.
        self.is_local = self.parsed_url.scheme in ('', 'file')
        # Internal links (e.g. "#anchor") have no path component at all.
        self.is_internal = self.is_local and not bool(self.parsed_url.path)
        self.fragment = self.parsed_url.fragment
        if self.is_local and not self.is_internal:
            self.path = self.url_to_local_path(self.parsed_url, base)
        else:
            self.path = None

    def __hash__(self):
        # Hash by resolved path when one exists, otherwise by the raw URL,
        # mirroring __eq__ which compares paths.
        return hash(self.url) if self.path is None else hash(self.path)

    def __eq__(self, other):
        # Also compares equal to a bare path string.
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return u'Link: %s --> %s'%(self.url, self.path)
class IgnoreFile(Exception):
    '''
    Raised when a linked HTML file could not be read and should simply
    be dropped from the traversal.  ``doesnt_exist`` is True when the
    OS reported ENOENT (errno 2).
    '''

    def __init__(self, msg, errno):
        super(IgnoreFile, self).__init__(msg)
        self.errno = errno
        self.doesnt_exist = (errno == 2)
class HTMLFile(object):
    '''
    Contains basic information about an HTML file. This
    includes a list of links to other files as well as
    the encoding of each file. Also tries to detect if the file is not a HTML
    file in which case :member:`is_binary` is set to True.

    The encoding of the file is available as :member:`encoding`.
    NOTE: :member:`encoding` is only assigned for non-binary files.
    '''

    # Cheap sniff: an "<html" tag anywhere in the first 1024 bytes marks
    # the file as HTML; otherwise it is treated as binary.
    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
    # href extractor for <a> tags: double-quoted (url1), single-quoted
    # (url2) or unquoted (url3) attribute values.
    LINK_PAT = re.compile(
        r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
        re.DOTALL|re.IGNORECASE)

    def __init__(self, path_to_html_file, level, encoding, verbose):
        '''
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML. If None it is
                         auto-detected from the first 4096 bytes.
        '''
        self.path = unicode_path(path_to_html_file, abs=True)
        self.base = os.path.dirname(self.path)
        self.level = level
        self.links = []
        try:
            with open(self.path, 'rb') as f:
                src = f.read()
        except IOError, err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
            if level == 0:
                # The root file must be readable; failure here is fatal.
                raise IOError(msg)
            # Unreadable linked files are skipped by the caller (traverse).
            raise IgnoreFile(msg, err.errno)
        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
        if not self.is_binary:
            if encoding is None:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
            self.encoding = encoding
            # Decode with 'replace' so malformed bytes never abort parsing.
            src = src.decode(encoding, 'replace')
            self.find_links(src)

    def __eq__(self, other):
        # Compares by filesystem path; also equal to a bare path string.
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)

    def __repr__(self):
        return str(self)

    def find_links(self, src):
        # Collect unique Links in first-seen order (uniqueness via Link.__eq__).
        for match in self.LINK_PAT.finditer(src):
            url = None
            # The three groups are mutually exclusive; take whichever matched.
            for i in ('url1', 'url2', 'url3'):
                url = match.group(i)
                if url:
                    break
            link = self.resolve(url)
            if link not in self.links:
                self.links.append(link)

    def resolve(self, url):
        # Resolve a raw href relative to this file's directory.
        return Link(url, self.base)
def depth_first(root, flat, visited=None):
    '''
    Yield `root` and every file reachable from it via its links, in
    depth-first order.  Only links whose target is present in `flat`
    are followed.

    :param root:    The file object to start from (must have ``links``).
    :param flat:    Flat list of all known files; used to map a link back
                    to its file object.  Links to files pruned from flat
                    (e.g. by max_levels) are silently skipped.
    :param visited: Set of already-yielded files, used internally by the
                    recursion.  Callers should not pass it.
    '''
    # Bug fix: the original used a mutable default argument
    # (visited=set([])), so visited state leaked between *separate*
    # top-level calls — a second traversal would yield only the root.
    if visited is None:
        visited = set()
    yield root
    visited.add(root)
    for link in root.links:
        if link.path is not None and link not in visited:
            try:
                index = flat.index(link)
            except ValueError: # Can happen if max_levels is used
                continue
            hf = flat[index]
            if hf not in visited:
                yield hf
                visited.add(hf)
                for hf in depth_first(hf, flat, visited):
                    if hf not in visited:
                        yield hf
                        visited.add(hf)
def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
    '''
    Recursively traverse all links in the HTML file.

    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
                       implies that no links in the root HTML file are followed.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    :return: A pair of lists (breadth_first, depth_first). Each list contains
             :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    level = 0
    # `flat` accumulates every discovered file in breadth-first order.
    flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
    next_level = list(flat)
    while level < max_levels and len(next_level) > 0:
        level += 1
        nl = []
        for hf in next_level:
            rejects = []
            for link in hf.links:
                # Skip non-local links and files already seen
                # (HTMLFile.__eq__ compares against a bare path string).
                if link.path is None or link.path in flat:
                    continue
                try:
                    nf = HTMLFile(link.path, level, encoding, verbose)
                    nl.append(nf)
                    flat.append(nf)
                except IgnoreFile, err:
                    # Unreadable target: remember it so the dangling link
                    # can be removed from its parent below.
                    rejects.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print str(err)
            # Remove after the loop so hf.links is not mutated while iterated.
            for link in rejects:
                hf.links.remove(link)
        next_level = list(nl)
    # Depth-first ordering is derived from the same flat list.
    return flat, list(depth_first(flat[0], flat))
def opf_traverse(opf_reader, verbose=0, encoding=None):
    '''
    Return a list of :class:`HTMLFile` objects in the order specified by the
    `<spine>` element of the OPF.

    :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
    :param encoding: Specify character encoding of HTML files. If `None` it is
                     auto-detected.
    :raises ValueError: If the OPF has no spine.
    '''
    if not opf_reader.spine:
        raise ValueError('OPF does not have a spine')
    flat = []
    for path in opf_reader.spine.items():
        # Bug fix: normalize BEFORE the membership test.  The original
        # checked the raw spine path against already-absolutized entries
        # (`if path not in flat: flat.append(os.path.abspath(path))`),
        # so duplicate spine items were never actually deduplicated.
        path = os.path.abspath(path)
        if path not in flat:
            flat.append(path)
    return [HTMLFile(path, 0, encoding, verbose) for path in flat]
if __name__ == '__main__':
    # Smoke test: traverse the HTML file named on the command line and
    # print both orderings.  NOTE: the local name `depth_first` shadows
    # the depth_first() generator defined above.
    breadth_first, depth_first = traverse(sys.argv[1], verbose=2)
    print 'Breadth first...'
    for f in breadth_first: print f
    print '\n\nDepth first...'
    for f in depth_first: print f

View File

@ -284,7 +284,7 @@ class PreProcessor(object):
return html
class Parser(PreProcessor):
class Parser(PreProcessor, LoggingInterface):
ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)]
@ -299,10 +299,32 @@ class Parser(PreProcessor):
self.resource_map = resource_map
self.htmlfiles = htmlfiles
self.resource_dir = os.path.join(tdir, 'resources')
save_counter = 1
self.htmlfile_map = {}
for f in self.htmlfiles:
name = os.path.basename(f.path)
if name in self.htmlfile_map.values():
name = os.path.splitext(name)[0] + '_cr_%d'%save_counter + os.path.splitext(name)[1]
save_counter += 1
self.htmlfile_map[f.path] = name
self.parse_html()
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
def save(self):
'''
Save processed HTML into the content directory.
Should be called after all HTML processing is finished.
'''
with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f:
f.write(html.tostring(self.root,
encoding='utf-8', method='xml',
include_meta_content_type=True,
pretty_print=self.opts.pretty_print)
)
return f.name
def parse_html(self):
''' Create lxml ElementTree from HTML '''
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
@ -350,7 +372,7 @@ class Parser(PreProcessor):
if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
return olink
if link.path in self.htmlfiles:
return os.path.basename(link.path)
return self.htmlfile_map[link.path]
if link.path in self.resource_map.keys():
return self.resource_map[link.path]
name = os.path.basename(link.path)
@ -437,6 +459,8 @@ def config(defaults=None):
help=_('The output directory. Default is the current directory.'))
c.add_opt('encoding', ['--encoding'], default=None,
help=_('Character encoding for HTML files. Default is to auto detect.'))
c.add_opt('zip', ['--zip'], default=False,
help=_('Create the output in a zip file. If this option is specified, the --output should be the name of a file not a directory.'))
traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
traversal('breadth_first', ['--breadth-first'], default=False,
@ -453,6 +477,8 @@ def config(defaults=None):
debug = c.add_group('debug', _('Options useful for debugging'))
debug('verbose', ['-v', '--verbose'], default=0, action='count',
help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
debug('pretty_print', ['--pretty-print'], default=False,
help=_('Output HTML is "pretty printed" for easier parsing by humans'))
return c
@ -487,7 +513,6 @@ def get_filelist(htmlfile, opts):
print '\tFound files...'
for f in filelist:
print '\t\t', f
return opf, filelist
def parse_content(filelist, opts):
@ -499,9 +524,10 @@ def parse_content(filelist, opts):
os.makedirs(rdir)
resource_map = {}
for htmlfile in filelist:
Parser(htmlfile, opts, os.path.join(opts.output, 'content'),
p = Parser(htmlfile, opts, os.path.join(opts.output, 'content'),
resource_map, filelist)
return resource_map
p.save()
return resource_map, p.htmlfile_map
def merge_metadata(htmlfile, opf, opts):
if opf:
@ -519,23 +545,27 @@ def merge_metadata(htmlfile, opf, opts):
mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
if not mi.authors:
mi.authors = [_('Unknown')]
return mi
def create_metadata(basepath, mi, filelist, resources):
mi = OPFCreator(basepath, mi)
entries = [(f.path, None) for f in filelist] + [(f, None) for f in resources]
entries = [('content/'+f, None) for f in filelist] + [(f, None) for f in resources]
mi.create_manifest(entries)
mi.create_spine([f.path for f in filelist])
mi.create_spine(['content/'+f for f in filelist])
return mi
def create_dir(htmlfile, opts):
opf, filelist = get_filelist(htmlfile, opts)
mi = merge_metadata(htmlfile, opf, opts)
resources = [os.path.join(opts.output, 'content', f) for f in parse_content(filelist, opts).values()]
resource_map, htmlfile_map = parse_content(filelist, opts)
resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
if opf.cover and os.access(opf.cover, os.R_OK):
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1])
shutil.copyfile(opf.cover, cpath)
resources.append(cpath)
mi = create_metadata(opts.output, mi, filelist, resources)
mi.cover = cpath
spine = [htmlfile_map[f.path] for f in filelist]
mi = create_metadata(opts.output, mi, spine, resources)
with open(os.path.join(opts.output, 'metadata.opf'), 'wb') as f:
mi.render(f)
print 'Open ebook created in', opts.output
@ -560,11 +590,12 @@ def main(args=sys.argv):
return 1
htmlfile = args[1]
if opts.zip:
create_oebzip(htmlfile, opts)
else:
create_dir(htmlfile, opts)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -19,13 +19,13 @@ NONE = QVariant()
class JobManager(QAbstractTableModel):
wait_icon = QVariant(QIcon(':/images/jobs.svg'))
running_icon = QVariant(QIcon(':/images/exec.svg'))
error_icon = QVariant(QIcon(':/images/dialog_error.svg'))
done_icon = QVariant(QIcon(':/images/ok.svg'))
def __init__(self):
QAbstractTableModel.__init__(self)
self.wait_icon = QVariant(QIcon(':/images/jobs.svg'))
self.running_icon = QVariant(QIcon(':/images/exec.svg'))
self.error_icon = QVariant(QIcon(':/images/dialog_error.svg'))
self.done_icon = QVariant(QIcon(':/images/ok.svg'))
self.jobs = []
self.server = Server()
self.add_job = Dispatcher(self._add_job)

View File

@ -35,7 +35,7 @@ class Distribution(object):
('ImageMagick', '6.3.5', 'imagemagick', 'imagemagick', 'ImageMagick'),
('xdg-utils', '1.0.2', 'xdg-utils', 'xdg-utils', 'xdg-utils'),
('dbus-python', '0.82.2', 'dbus-python', 'python-dbus', 'dbus-python'),
('lxml', '1.3.3', 'lxml', 'python-lxml', 'python-lxml'),
('lxml', '2.0.5', 'lxml', 'python-lxml', 'python-lxml'),
('help2man', '1.36.4', 'help2man', 'help2man', 'help2man'),
]