From 41a938aef021b963397347f4355c2ac109dd3b8f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Aug 2008 12:11:03 -0700 Subject: [PATCH] IGN:Working html2oeb --- src/calibre/__init__.py | 3 + src/calibre/ebooks/epub/__init__.py | 1 + src/calibre/ebooks/epub/traverse.py | 218 --------------------------- src/calibre/ebooks/html.py | 61 ++++++-- src/calibre/gui2/jobs2.py | 24 +-- src/calibre/trac/plugins/download.py | 2 +- 6 files changed, 63 insertions(+), 246 deletions(-) delete mode 100644 src/calibre/ebooks/epub/traverse.py diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index e7bc7dd472..2703431835 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -268,6 +268,9 @@ class LoggingInterface: self.__logger = logger def setup_cli_handler(self, verbosity): + for handler in self.__logger.handlers: + if isinstance(handler, logging.StreamHandler): + return if os.environ.get('CALIBRE_WORKER', None) is not None and self.__logger.handlers: return stream = sys.stdout diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index ced1d268af..45d5d44296 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -19,6 +19,7 @@ def config(defaults=None): c.update(common_config()) c.remove_opt('output') + c.remove_opt('zip') c.add_opt('output', ['-o', '--output'], default=None, help=_('The output EPUB file. If not specified, it is derived from the input file name.')) diff --git a/src/calibre/ebooks/epub/traverse.py b/src/calibre/ebooks/epub/traverse.py deleted file mode 100644 index 6f942de3b9..0000000000 --- a/src/calibre/ebooks/epub/traverse.py +++ /dev/null @@ -1,218 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Recursively parse HTML files to find all linked files. See :function:`traverse`. -''' - -import sys, os, re -from urlparse import urlparse -from urllib import unquote -from calibre import unicode_path -from calibre.ebooks.chardet import xml_to_unicode - -class Link(object): - ''' - Represents a link in a HTML file. - ''' - - @classmethod - def url_to_local_path(cls, url, base): - path = url.path - if os.path.isabs(path): - return path - return os.path.abspath(os.path.join(base, path)) - - def __init__(self, url, base): - ''' - :param url: The url this link points to. Must be an unquoted unicode string. - :param base: The base directory that relative URLs are with respect to. - Must be a unicode string. - ''' - assert isinstance(url, unicode) and isinstance(base, unicode) - self.url = url - self.parsed_url = urlparse(unquote(self.url)) - self.is_local = self.parsed_url.scheme in ('', 'file') - self.is_internal = self.is_local and not bool(self.parsed_url.path) - self.path = None - self.fragment = self.parsed_url.fragment - if self.is_local and not self.is_internal: - self.path = self.url_to_local_path(self.parsed_url, base) - - def __hash__(self): - if self.path is None: - return hash(self.url) - return hash(self.path) - - def __eq__(self, other): - return self.path == getattr(other, 'path', other) - - def __str__(self): - return u'Link: %s --> %s'%(self.url, self.path) - - -class IgnoreFile(Exception): - - def __init__(self, msg, errno): - Exception.__init__(self, msg) - self.doesnt_exist = errno == 2 - self.errno = errno - -class HTMLFile(object): - ''' - Contains basic information about an HTML file. This - includes a list of links to other files as well as - the encoding of each file. Also tries to detect if the file is not a HTML - file in which case :member:`is_binary` is set to True. - - The encoding of the file is available as :member:`encoding`. - ''' - - HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE) - LINK_PAT = re.compile( - r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P[^"]+)")|(?:\'(?P[^\']+)\')|(?P[^\s]+))', - re.DOTALL|re.IGNORECASE) - - def __init__(self, path_to_html_file, level, encoding, verbose): - ''' - :param level: The level of this file. Should be 0 for the root file. - :param encoding: Use `encoding` to decode HTML. - ''' - self.path = unicode_path(path_to_html_file, abs=True) - self.base = os.path.dirname(self.path) - self.level = level - self.links = [] - - try: - with open(self.path, 'rb') as f: - src = f.read() - except IOError, err: - msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err)) - if level == 0: - raise IOError(msg) - raise IgnoreFile(msg, err.errno) - - self.is_binary = not bool(self.HTML_PAT.search(src[:1024])) - - if not self.is_binary: - if encoding is None: - encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1] - self.encoding = encoding - - src = src.decode(encoding, 'replace') - self.find_links(src) - - - - def __eq__(self, other): - return self.path == getattr(other, 'path', other) - - def __str__(self): - return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path) - - def __repr__(self): - return str(self) - - - def find_links(self, src): - for match in self.LINK_PAT.finditer(src): - url = None - for i in ('url1', 'url2', 'url3'): - url = match.group(i) - if url: - break - link = self.resolve(url) - if link not in self.links: - self.links.append(link) - - def resolve(self, url): - return Link(url, self.base) - - -def depth_first(root, flat, visited=set([])): - yield root - visited.add(root) - for link in root.links: - if link.path is not None and link not in visited: - try: - index = flat.index(link) - except ValueError: # Can happen if max_levels is used - continue - hf = flat[index] - if hf not in visited: - yield hf - visited.add(hf) - for hf in depth_first(hf, flat, visited): - if hf not in visited: - yield hf - visited.add(hf) - - -def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None): - ''' - Recursively traverse all links in the HTML file. - - :param max_levels: Maximum levels of recursion. Must be non-negative. 0 - implies that no links in the root HTML file are followed. - :param encoding: Specify character encoding of HTML files. If `None` it is - auto-detected. - :return: A pair of lists (breadth_first, depth_first). Each list contains - :class:`HTMLFile` objects. - ''' - assert max_levels >= 0 - level = 0 - flat = [HTMLFile(path_to_html_file, level, encoding, verbose)] - next_level = list(flat) - while level < max_levels and len(next_level) > 0: - level += 1 - nl = [] - for hf in next_level: - rejects = [] - for link in hf.links: - if link.path is None or link.path in flat: - continue - try: - nf = HTMLFile(link.path, level, encoding, verbose) - nl.append(nf) - flat.append(nf) - except IgnoreFile, err: - rejects.append(link) - if not err.doesnt_exist or verbose > 1: - print str(err) - for link in rejects: - hf.links.remove(link) - - next_level = list(nl) - - return flat, list(depth_first(flat[0], flat)) - - -def opf_traverse(opf_reader, verbose=0, encoding=None): - ''' - Return a list of :class:`HTMLFile` objects in the order specified by the - `` element of the OPF. - - :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance. - :param encoding: Specify character encoding of HTML files. If `None` it is - auto-detected. - ''' - if not opf_reader.spine: - raise ValueError('OPF does not have a spine') - flat = [] - for path in opf_reader.spine.items(): - if path not in flat: - flat.append(os.path.abspath(path)) - flat = [HTMLFile(path, 0, encoding, verbose) for path in flat] - return flat - - - -if __name__ == '__main__': - breadth_first, depth_first = traverse(sys.argv[1], verbose=2) - print 'Breadth first...' - for f in breadth_first: print f - print '\n\nDepth first...' - for f in depth_first: print f - diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index 9b0345a799..fc0dd4e472 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -284,7 +284,7 @@ class PreProcessor(object): return html -class Parser(PreProcessor): +class Parser(PreProcessor, LoggingInterface): ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE), re.compile(r'', re.IGNORECASE)] @@ -299,17 +299,39 @@ class Parser(PreProcessor): self.resource_map = resource_map self.htmlfiles = htmlfiles self.resource_dir = os.path.join(tdir, 'resources') + save_counter = 1 + self.htmlfile_map = {} + for f in self.htmlfiles: + name = os.path.basename(f.path) + if name in self.htmlfile_map.values(): + name = os.path.splitext(name)[0] + '_cr_%d'%save_counter + os.path.splitext(name)[1] + save_counter += 1 + self.htmlfile_map[f.path] = name self.parse_html() self.root.rewrite_links(self.rewrite_links, resolve_base_href=False) + def save(self): + ''' + Save processed HTML into the content directory. + Should be called after all HTML processing is finished. + ''' + with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f: + f.write(html.tostring(self.root, + encoding='utf-8', method='xml', + include_meta_content_type=True, + pretty_print=self.opts.pretty_print) + ) + return f.name + + def parse_html(self): ''' Create lxml ElementTree from HTML ''' self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:])) src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace') src = self.preprocess(src) # lxml chokes on unicode input when it contains encoding declarations - for pat in self.ENCODING_PATS: + for pat in self.ENCODING_PATS: src = pat.sub('', src) try: self.root = html.document_fromstring(src) @@ -350,7 +372,7 @@ class Parser(PreProcessor): if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path): return olink if link.path in self.htmlfiles: - return os.path.basename(link.path) + return self.htmlfile_map[link.path] if link.path in self.resource_map.keys(): return self.resource_map[link.path] name = os.path.basename(link.path) @@ -358,7 +380,7 @@ class Parser(PreProcessor): name += ('_%d'%len(self.resource_map)) + ext shutil.copyfile(link.path, os.path.join(self.resource_dir, name)) name = 'resources/' + name - self.resource_map[link.path] = name + self.resource_map[link.path] = name return name def extract_css(self): @@ -437,6 +459,8 @@ def config(defaults=None): help=_('The output directory. Default is the current directory.')) c.add_opt('encoding', ['--encoding'], default=None, help=_('Character encoding for HTML files. Default is to auto detect.')) + c.add_opt('zip', ['--zip'], default=False, + help=_('Create the output in a zip file. If this option is specified, the --output should be the name of a file not a directory.')) traversal = c.add_group('traversal', _('Control the following of links in HTML files.')) traversal('breadth_first', ['--breadth-first'], default=False, @@ -453,6 +477,8 @@ def config(defaults=None): debug = c.add_group('debug', _('Options useful for debugging')) debug('verbose', ['-v', '--verbose'], default=0, action='count', help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.')) + debug('pretty_print', ['--pretty-print'], default=False, + help=_('Output HTML is "pretty printed" for easier parsing by humans')) return c @@ -487,7 +513,6 @@ def get_filelist(htmlfile, opts): print '\tFound files...' for f in filelist: print '\t\t', f - return opf, filelist def parse_content(filelist, opts): @@ -499,9 +524,10 @@ def parse_content(filelist, opts): os.makedirs(rdir) resource_map = {} for htmlfile in filelist: - Parser(htmlfile, opts, os.path.join(opts.output, 'content'), + p = Parser(htmlfile, opts, os.path.join(opts.output, 'content'), resource_map, filelist) - return resource_map + p.save() + return resource_map, p.htmlfile_map def merge_metadata(htmlfile, opf, opts): if opf: @@ -519,23 +545,27 @@ def merge_metadata(htmlfile, opf, opts): mi.title = os.path.splitext(os.path.basename(htmlfile))[0] if not mi.authors: mi.authors = [_('Unknown')] + return mi def create_metadata(basepath, mi, filelist, resources): mi = OPFCreator(basepath, mi) - entries = [(f.path, None) for f in filelist] + [(f, None) for f in resources] + entries = [('content/'+f, None) for f in filelist] + [(f, None) for f in resources] mi.create_manifest(entries) - mi.create_spine([f.path for f in filelist]) + mi.create_spine(['content/'+f for f in filelist]) return mi def create_dir(htmlfile, opts): opf, filelist = get_filelist(htmlfile, opts) mi = merge_metadata(htmlfile, opf, opts) - resources = [os.path.join(opts.output, 'content', f) for f in parse_content(filelist, opts).values()] + resource_map, htmlfile_map = parse_content(filelist, opts) + resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()] if opf.cover and os.access(opf.cover, os.R_OK): - cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)) + cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1]) shutil.copyfile(opf.cover, cpath) resources.append(cpath) - mi = create_metadata(opts.output, mi, filelist, resources) + mi.cover = cpath + spine = [htmlfile_map[f.path] for f in filelist] + mi = create_metadata(opts.output, mi, spine, resources) with open(os.path.join(opts.output, 'metadata.opf'), 'wb') as f: mi.render(f) print 'Open ebook created in', opts.output @@ -560,11 +590,12 @@ def main(args=sys.argv): return 1 htmlfile = args[1] - create_dir(htmlfile, opts) + if opts.zip: + create_oebzip(htmlfile, opts) + else: + create_dir(htmlfile, opts) return 0 if __name__ == '__main__': sys.exit(main()) - - diff --git a/src/calibre/gui2/jobs2.py b/src/calibre/gui2/jobs2.py index 9b30a3190e..109c4eaa33 100644 --- a/src/calibre/gui2/jobs2.py +++ b/src/calibre/gui2/jobs2.py @@ -19,13 +19,13 @@ NONE = QVariant() class JobManager(QAbstractTableModel): - wait_icon = QVariant(QIcon(':/images/jobs.svg')) - running_icon = QVariant(QIcon(':/images/exec.svg')) - error_icon = QVariant(QIcon(':/images/dialog_error.svg')) - done_icon = QVariant(QIcon(':/images/ok.svg')) - def __init__(self): QAbstractTableModel.__init__(self) + self.wait_icon = QVariant(QIcon(':/images/jobs.svg')) + self.running_icon = QVariant(QIcon(':/images/exec.svg')) + self.error_icon = QVariant(QIcon(':/images/dialog_error.svg')) + self.done_icon = QVariant(QIcon(':/images/ok.svg')) + self.jobs = [] self.server = Server() self.add_job = Dispatcher(self._add_job) @@ -42,13 +42,13 @@ class JobManager(QAbstractTableModel): def headerData(self, section, orientation, role): if role != Qt.DisplayRole: return NONE - if orientation == Qt.Horizontal: + if orientation == Qt.Horizontal: if section == 0: text = _("Job") elif section == 1: text = _("Status") elif section == 2: text = _("Progress") elif section == 3: text = _('Running time') return QVariant(text) - else: + else: return QVariant(section+1) def data(self, index, role): @@ -58,7 +58,7 @@ class JobManager(QAbstractTableModel): row, col = index.row(), index.column() job = self.jobs[row] - if role == Qt.DisplayRole: + if role == Qt.DisplayRole: if col == 0: desc = job.description if not desc: @@ -145,7 +145,7 @@ class JobManager(QAbstractTableModel): return True return False - def run_job(self, done, func, args=[], kwargs={}, + def run_job(self, done, func, args=[], kwargs={}, description=None): job = ParallelJob(func, done, self, args=args, kwargs=kwargs, description=description) @@ -159,15 +159,15 @@ class JobManager(QAbstractTableModel): def kill_job(self, row, view): job = self.jobs[row] if isinstance(job, DeviceJob): - error_dialog(view, _('Cannot kill job'), + error_dialog(view, _('Cannot kill job'), _('Cannot kill jobs that communicate with the device')).exec_() return if job.has_run: - error_dialog(view, _('Cannot kill job'), + error_dialog(view, _('Cannot kill job'), _('Job has already run')).exec_() return if not job.is_running: - error_dialog(view, _('Cannot kill job'), + error_dialog(view, _('Cannot kill job'), _('Cannot kill waiting job')).exec_() return diff --git a/src/calibre/trac/plugins/download.py b/src/calibre/trac/plugins/download.py index 15ee30b09a..d2667fcb38 100644 --- a/src/calibre/trac/plugins/download.py +++ b/src/calibre/trac/plugins/download.py @@ -35,7 +35,7 @@ class Distribution(object): ('ImageMagick', '6.3.5', 'imagemagick', 'imagemagick', 'ImageMagick'), ('xdg-utils', '1.0.2', 'xdg-utils', 'xdg-utils', 'xdg-utils'), ('dbus-python', '0.82.2', 'dbus-python', 'python-dbus', 'dbus-python'), - ('lxml', '1.3.3', 'lxml', 'python-lxml', 'python-lxml'), + ('lxml', '2.0.5', 'lxml', 'python-lxml', 'python-lxml'), ('help2man', '1.36.4', 'help2man', 'help2man', 'help2man'), ]